ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.11
Committed: Wed Nov 16 13:28:54 2005 UTC (19 years, 5 months ago) by afanfani
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_0_7, CRAB_1_0_7_pre1, CRAB_1_0_6, CRAB_1_0_5, CRAB_1_0_4, CRAB_1_0_3
Changes since 1.10: +12 -7 lines
Log Message:
slight improvement in the error messages
fix to really skip CNAF in CEblacklist
fix to avoid counting the PU CollID multiple times

File Contents

# User Rev Content
1 slacapra 1.3 #!/usr/bin/env python2
2 nsmirnov 1.1 import sys, os, string, re
3     import urllib, urllister
4     import urllib2
5 nsmirnov 1.2 import common
6 slacapra 1.3 from RefDBInfo import RefDBInfo
7 slacapra 1.6 from PubDBInfo import *
8 nsmirnov 1.1
9     # ####################################
class PubDBError(Exception):
    """Raised when a PubDB instance cannot be contacted at the given URL."""
    def __init__(self, url):
        # Derive from Exception so 'raise PubDBError(url)' is a proper
        # exception (the original old-style class relied on deprecated
        # classic-class raising); keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR accessing PubDB at '+url)
        print('\nERROR accessing PubDB at '+url+'\n')
14 slacapra 1.5
15     # ####################################
class PubDBGetAnalysisError(Exception):
    """Raised when collection info cannot be extracted from a PubDB."""
    def __init__(self, url, Collections):
        # Derive from Exception (old-style class raises are deprecated);
        # keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR extracting info for collections '+Collections+' from PubDB '+url)
        print('\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n')
20    
21 slacapra 1.3 # ####################################
class RefDBmapError(Exception):
    """Raised when the central RefDB-to-PubDBs map script is unreachable."""
    def __init__(self, url):
        # Derive from Exception (old-style class raises are deprecated);
        # keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR accessing RefDB-PubDBs map at '+url)
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
26 nsmirnov 1.1
27     # ####################################
class NoPHPError(Exception):
    """Raised when the analysis PHP script is not available (old PubDB)."""
    def __init__(self, url):
        # Derive from Exception (old-style class raises are deprecated).
        # The message is built as one string: the original comma-style
        # 'print a, b, c' produced inconsistent spacing.
        Exception.__init__(self, 'ERROR accessing PHP: '+url)
        print('ERROR accessing PHP: '+url+' \n')
33 nsmirnov 1.1
34     # ####################################
class pubDBResult:
    """Container for the raw text returned by an old-style PubDB query
    (one instance per collection, filled in PubDB.getPubDBData)."""
    def __init__(self,
                 contents):
        # raw key-value text as downloaded from get-pubdb-analysisinfo.php
        self.contents=contents


    def dump(self):
        # Debugging aid: echo the raw PubDB reply to stdout.
        print 'Contents : ',self.contents
        pass
44 nsmirnov 1.1
45     # ####################################
46     # class to access PubDBs
47     class PubDB:
48 slacapra 1.3 def __init__(self, owner, dataset, dataTiers, cfg_params):
49    
50     # Attributes
51     self.owner = owner
52     self.dataset = dataset
53     self.dataTiers = dataTiers
54 slacapra 1.5 self.NeededdataTiers=[]
55 slacapra 1.3 self.cfg_params = cfg_params
56 nsmirnov 1.1
57 slacapra 1.3 self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
58     self.RefDBphp_ = 'PubDB/GetIdCollection.php'
59     self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'
60 nsmirnov 1.1
61 slacapra 1.3 self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
62     self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'
63 nsmirnov 1.1
64 slacapra 1.3 self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
65     self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'
66 nsmirnov 1.1
67 slacapra 1.3 ## link to the modified RefDB-PubDBs map script that allow the display option
68     self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'
69 nsmirnov 1.1
70 slacapra 1.3 # Costructor procedures
71 nsmirnov 1.1
72 slacapra 1.3 CEBlackList = []
73     try:
74 slacapra 1.5 tmpBad = string.split(self.cfg_params['EDG.ce_black_list'],',')
75 slacapra 1.3 #tmpBad = ['fnal']
76     for tmp in tmpBad:
77     tmp=string.strip(tmp)
78 afanfani 1.11 if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
79 slacapra 1.3 CEBlackList.append(tmp)
80     except KeyError:
81     pass
82 slacapra 1.7
83     CEWhiteList = []
84     try:
85     tmpGood = string.split(self.cfg_params['EDG.ce_white_list'],',')
86     #tmpGood = ['cern']
87     for tmp in tmpGood:
88     tmp=string.strip(tmp)
89     #if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
90     CEWhiteList.append(tmp)
91     except KeyError:
92     pass
93    
94     #print 'CEWhiteList: ',CEWhiteList
95     self.reCEWhiteList=[]
96     for Good in CEWhiteList:
97     self.reCEWhiteList.append(re.compile( Good ))
98     #print 'ReGood: ',self.reCEWhiteList
99    
100 slacapra 1.5 common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
101 slacapra 1.7 common.logger.debug(5,'CEWhiteList: '+str(CEWhiteList))
102 slacapra 1.3 self.reCEBlackList=[]
103     for bad in CEBlackList:
104     self.reCEBlackList.append(re.compile( bad ))
105     #print 'ReBad: ',self.reCEBlackList
106 nsmirnov 1.1
107    
108     ########################################################################
109 slacapra 1.3 def findAllCollections(self):
110     """
111     Contact RefDB and find the CollID of all the user required collections
112     """
113     ## download from RefDB all the info about the given dataset-owner
114     refdb=RefDBInfo(self.owner,self.dataset)
115     #print refdb.GetRefDBInfo()
116     try:
117     collInfos=refdb.GetRefDBInfo()
118     except :
119     sys.exit(10)
120     #print "collInfos=", collInfos
121    
122     first=1
123     NeededCollID=[]
124     refdbdataTiers=[]
125     for coll in collInfos:
126     ## select the primary collection
127     if first:
128     NeededCollID.append(coll[0])
129 slacapra 1.5 self.NeededdataTiers.append(coll[2])
130 slacapra 1.3 refdbdataTiers.append(coll[2])
131     common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
132     first=0
133     else:
134     ## select only the parents collections corresponding to data-tiers requested by the user
135     if self.dataTiers.count(coll[2]):
136     NeededCollID.append(coll[0])
137 slacapra 1.5 self.NeededdataTiers.append(coll[2])
138 slacapra 1.3 common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
139     refdbdataTiers.append(coll[2])
140    
141     ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
142     for dt in self.dataTiers:
143     if refdbdataTiers.count(dt)<=0:
144 slacapra 1.5 msg = "ERROR: Data Tier ( =>",dt,"<= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
145     msg = str(msg) + 'Owner Dataset not published with asked dataTiers! '+\
146 slacapra 1.10 self.owner+' '+ self.dataset+' '+str(self.dataTiers)+'\n'
147 slacapra 1.5 msg = str(msg) + ' Check the data_tier variable in crab.cfg !\n'
148     common.logger.message(msg)
149 slacapra 1.3 return []
150    
151     #print 'Needed Collections are ', NeededCollID
152     #return collInfos
153     #print "NeededCollID= ", NeededCollID
154     return NeededCollID
155    
156 nsmirnov 1.1 ########################################################################
157 slacapra 1.3 def findPubDBsbyCollID(self,CollID):
158     """
159     Find the list of PubDB URLs having a given Collection
160     """
161     ### contact the RefDB-PubDBs map to discovery where the given CollID is
162     url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
163     # print "%s"%(url)
164     try:
165     f = urllib.urlopen(url)
166     except IOError:
167     # print 'Cannot access URL: '+url
168     raise RefDBmapError(url)
169    
170     ### search for the PubDBURL string
171     reURLLine=re.compile( r'PubDBURL=(\S*)' )
172    
173     PubDBURLs = []
174     for line in f.readlines():
175     #print '#',line,'#'
176     if reURLLine.search(line) :
177     URLLine=reURLLine.search(line).group()
178     #print string.split(URLLine,'=')[1]
179     PubDBURLs.append(string.split(URLLine,'=')[1])
180 nsmirnov 1.1
181 slacapra 1.3 ### return the list of PubDBURL where the collection is present
182     #return PubDBURLs
183     return self.uniquelist(PubDBURLs)
184    
185     ################################################################
186     def findPubDBs(self,CollIDs):
187     """
188     Find the list of PubDB URLs having ALL the required collections
189     """
190     ### loop over all the required collections
191     #pubdbmap={}
192     allurls=[]
193     countColl=0
194     for CollID in CollIDs :
195     countColl=countColl+1
196     ### map the CollectionID with the list of PubDB URLs
197     #pubdbmap[CollID]=self.findPubDBsbyCollID(CollID)
198     ### prepare a list all PubDB urls for all collections
199     allurls.extend(self.findPubDBsbyCollID(CollID))
200     #print pubdbmap.values()
201    
202     ### select only PubDB urls that contains all the collections
203     unique_urls=self.uniquelist(allurls)
204     SelectedPubDBURLs=[]
205     # loop on a unique list of PubDB urls
206     for url in unique_urls :
207     # check that PubDBurl occurrance is the same as the number of collections
208     if ( allurls.count(url)==countColl ) :
209     SelectedPubDBURLs.append(url)
210 slacapra 1.7 common.logger.debug(5,'PubDBs '+str(SelectedPubDBURLs))
211 slacapra 1.3
212     #print 'Required Collections',CollIDs,'are all present in PubDBURLs : ',SelectedPubDBURLs,'\n'
213 slacapra 1.7 #### check based on CE black list: select only PubDB not in the CE black list
214     tmp=self.checkBlackList(SelectedPubDBURLs)
215     common.logger.debug(5,'PubDBs after black list '+str(tmp))
216    
217     ### check based on CE white list: select only PubDB defined by user
218     GoodPubDBURLs=self.checkWhiteList(tmp)
219 afanfani 1.11 if len(GoodPubDBURLs)>0 :
220     common.logger.debug(5,'PubDBs after white list '+str(GoodPubDBURLs))
221     common.logger.debug(3,'Selected sites via PubDB URLs are '+str(GoodPubDBURLs))
222 slacapra 1.3 return GoodPubDBURLs
223    
224     #######################################################################
225     def uniquelist(self, old):
226     """
227 slacapra 1.7 remove duplicates from a list
228 slacapra 1.3 """
229     nd={}
230     for e in old:
231     nd[e]=0
232     return nd.keys()
233 nsmirnov 1.1
234 slacapra 1.3 #######################################################################
235 slacapra 1.7 def checkWhiteList(self, pubDBUrls):
236     """
237     select PubDB URLs that are at site defined by the user (via CE white list)
238     """
239 slacapra 1.8 if len(self.reCEWhiteList)==0: return pubDBUrls
240 slacapra 1.7 goodurls = []
241     for url in pubDBUrls:
242     #print 'connecting to the URL ',url
243     good=0
244     for re in self.reCEWhiteList:
245     if re.search(url):
246     common.logger.debug(5,'CE in white list, adding PubDB URL '+url)
247     good=1
248     if not good: continue
249     goodurls.append(url)
250     if len(goodurls) == 0:
251 afanfani 1.11 common.logger.message("No sites found via PubDB \n")
252 slacapra 1.7 else:
253 afanfani 1.11 common.logger.debug(5,"Selected sites via PubDB URLs are "+str(goodurls)+"\n")
254 slacapra 1.7 return goodurls
255    
256     #######################################################################
257 slacapra 1.3 def checkBlackList(self, pubDBUrls):
258     """
259 slacapra 1.5 select PubDB URLs that are at site not exluded by the user (via CE black list)
260 slacapra 1.3 """
261 slacapra 1.8 if len(self.reCEBlackList)==0: return pubDBUrls
262 slacapra 1.3 goodurls = []
263     for url in pubDBUrls:
264 slacapra 1.5 common.logger.debug(10,'connecting to the URL '+url)
265 slacapra 1.3 good=1
266     for re in self.reCEBlackList:
267     if re.search(url):
268 slacapra 1.4 common.logger.message('CE in black list, skipping PubDB URL '+url)
269     good=0
270     pass
271     if good: goodurls.append(url)
272 slacapra 1.3 if len(goodurls) == 0:
273 afanfani 1.11 common.logger.debug(3,"No sites found via PubDB")
274 slacapra 1.3 return goodurls
275 slacapra 1.5
276     ########################################################################
277     def checkPubDBNewVersion(self, baseurl):
278     """
279     Check PubDB version to find out if it's new-style or old-style
280     """
281     ### check based on the existance of pubdb-get-version.php
282     urlversion=baseurl+'pubdb-get-version.php'
283     newversion=1;
284     try:
285     v = urllib2.urlopen(urlversion)
286     except urllib2.URLError, msg:
287     #print "WARNING: no URL to get PubDB version "
288     newversion=0;
289    
290     if (newversion) :
291     schemaversion = v.read()
292     #print schemaversion;
293    
294     return newversion
295    
296 slacapra 1.3 ########################################################################
297 slacapra 1.5 def getPubDBData(self, CollIDs, url , newversion):
298 slacapra 1.3 """
299     Contact a PubDB to collect all the relevant information
300     """
301     result = []
302 slacapra 1.5
303     ### get the base PubDb url
304     end=string.rfind(url,'/')
305     lastEq=string.rfind(url,'=')
306    
307     if (newversion) :
308     ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
309     Collections=string.join(CollIDs,'-')
310     ## add the PU among the required Collections if the Digi are requested
311     # ( for the time being asking it directly to the PubDB so the RefDB
312     # level data discovery is bypassed..... in future when every site
313     # will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
314     if ( self.NeededdataTiers.count('Digi') ):
315     PUCollID=self.getDatatierCollID(url[:end+1],Collections,"PU")
316 afanfani 1.11 if (PUCollID) :
317     if CollIDs.count(PUCollID)<=0:
318     CollIDs.append(PUCollID)
319 slacapra 1.5 ##
320     Collections=string.join(CollIDs,'-')
321     ### download from PubDB all the info about the given collections
322     pubdb_analysis=PubDBInfo(url[:end+1],Collections)
323     #print pubdb_analysis.GetPubDBInfo()
324     ok=0
325     try:
326     catInfos=pubdb_analysis.GetPubDBInfo()
327     ok=1
328     except :
329     #print "WARNING: can't get PubDB content out of "+url[:end+1]+"\n"
330 afanfani 1.11 print '\nERROR extracting info for collections '+Collections+' from PubDB '+url[:end+1]+'.'
331     print '>>>> Ask for help reporting that the failing PubDB script is: \n>>>> '+url[:end+1]+'pubdb-get-analysisinfo.php?collid='+Collections
332 slacapra 1.5 #raise PubDBGetAnalysisError(url[:end+1],Collections)
333     if (ok): result=catInfos;
334    
335     else:
336    
337     ### before PubDB V4 : get info for each collection and read the key-value pair text
338    
339     for CollID in CollIDs:
340 slacapra 1.3 urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
341     # print 'PHP URL: '+urlphp+' \n'
342 slacapra 1.5
343 slacapra 1.3 reOld=re.compile( r'V24' )
344     #print urlphp,'Old PubDB ',reOld.search(urlphp)
345     if reOld.search(urlphp):
346     raise NoPHPError(urlphp)
347     else:
348     try:
349     f = urllib2.urlopen(urlphp)
350     except urllib2.URLError, msg:
351     print "WARNING: ", msg
352     raise PubDBError(urlphp)
353     except urllib2.HTTPError, msg:
354     print "WARNING: ", msg
355     raise NoPHPError(urlphp)
356     content = f.read()
357     result.append(pubDBResult(content))
358     #print "Coll",CollID," content ",content
359     pass
360     pass
361    
362     #print '.....'
363     #for r in result:
364     # r.dump()
365     #print '.....'
366     return result
367 slacapra 1.5
368     ########################################################################
369     def getDatatierCollID(self,urlbase,CollIDString,datatier):
370     """
371     Contact a script of PubDB to retrieve the collid a DataTier
372     """
373     try:
374     f = urllib.urlopen(urlbase+'pubdb-get-collidbydatatier.php?collid='+CollIDString+"&datatier="+datatier)
375     except IOError:
376     raise PubDBGetAnalysisError(url[:end+1]+'pubdb-get-collidbydatatier.php',CollIDString)
377     data = f.read()
378     colldata=re.compile(r'collid=(\S*)').search(data);
379     if colldata:
380     datatier_CollID=colldata.group(1)
381     # print " --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier
382     common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)
383    
384     return datatier_CollID
385    
386 slacapra 1.3 ########################################################################
387 slacapra 1.5 def getAllPubDBData(self):
388 slacapra 1.3 """
389     Contact a list of PubDB to collect all the relevant information
390     """
391 slacapra 1.5 newPubDBResult=[]
392     oldPubDBResult=[]
393     Result={}
394    
395     ### find the user-required collection IDs
396     CollIDs = self.findAllCollections()
397     ### find the PubDB URLs publishing the needed data
398     urllist = self.findPubDBs(CollIDs)
399     ### collect information sparatelly from new-style PubDBs and old-style PubDBs
400 slacapra 1.3 for pubdburl in urllist:
401 slacapra 1.5 end=string.rfind(pubdburl,'/')
402     newversion=self.checkPubDBNewVersion(pubdburl[:end+1])
403     if (newversion):
404     res=self.getPubDBData(CollIDs,pubdburl,newversion)
405     if len(res)>0:
406     newPubDBResult.append(res)
407     else:
408     resold=self.getPubDBData(CollIDs,pubdburl,newversion)
409     if len(resold)>0:
410     oldPubDBResult.append(resold)
411     ### fill a dictionary with all the PubBDs results both old-style and new-style
412     Result['newPubDB']=newPubDBResult
413     Result['oldPubDB']=oldPubDBResult
414    
415 slacapra 1.3 ## print for debugging purpose
416 slacapra 1.5 #
417     #for PubDBversion in Result.keys():
418     #print ("key %s, val %s" %(PubDBversion,Result[PubDBversion]))
419     # if len(Result[PubDBversion])>0 :
420     #print (" key %s"%(PubDBversion))
421     # for result in Result[PubDBversion]:
422     # for r in result:
423     #r.dump()
424     # common.log.write('----------------- \n')
425     #print '.....................................'
426    
427     return Result
428    
429 slacapra 1.3 ####################################################################