#!/usr/bin/env python
import sys, os, string, re
import exceptions   # base module for the custom exception classes defined below

from DBSInfo import *

# ####################################
class DataDiscoveryError(exceptions.Exception):
    def __init__(self, errorMessage):
        args = errorMessage
        exceptions.Exception.__init__(self, args)
        pass

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args)

# ####################################
class NotExistingDatasetError(exceptions.Exception):
    def __init__(self, errorMessage):
        args = errorMessage
        exceptions.Exception.__init__(self, args)
        pass

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args)

# ####################################
class NoDataTierinProvenanceError(exceptions.Exception):
    def __init__(self, errorMessage):
        args = errorMessage
        exceptions.Exception.__init__(self, args)
        pass

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args)

# ####################################
# class to find and extract info from published data
class DataDiscovery:
    def __init__(self, owner, dataset, dataTiers, cfg_params):

        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.cfg_params = cfg_params

        self.dbspaths = []    # DBS output: list of dbspaths for all data
        self.allblocks = []   # DBS output: list of fileblocks-totevts maps for all dataset-owners
        self.blocksinfo = {}  # DBS output: fileblocks-totevts map for the primary block, used internally by this class
        # DBS output: max events computed by method getMaxEvents

    # ####################################
    def fetchDBSInfo(self):
        """
        Contact DBS
        """

        ## add the PU among the required data tiers if the Digi are requested
        if (self.dataTiers.count('Digi') > 0) and (self.dataTiers.count('PU') <= 0):
            self.dataTiers.append('PU')

        ## get info about the requested dataset
        dbs = DBSInfo()
        try:
            self.datasets = dbs.getMatchingDatasets(self.owner, self.dataset)
        except DBSError, ex:
            raise DataDiscoveryError(ex.getErrorMessage())
        if len(self.datasets) == 0:
            raise DataDiscoveryError("Owner=%s, Dataset=%s unknown to DBS" % (self.owner, self.dataset))
        if len(self.datasets) > 1:
            raise DataDiscoveryError("Owner=%s, Dataset=%s is ambiguous" % (self.owner, self.dataset))
        try:
            self.dbsdataset = self.datasets[0].get('datasetPathName')
            self.blocksinfo = dbs.getDatasetContents(self.dbsdataset)
            self.allblocks.append(self.blocksinfo.keys())  # add also the current fileblocksinfo
            self.dbspaths.append(self.dbsdataset)
        except DBSError, ex:
            raise DataDiscoveryError(ex.getErrorMessage())

        if len(self.blocksinfo) <= 0:
            msg = "\nERROR Data for %s do not exist in DBS! \n Check the dataset/owner variables in crab.cfg !" % self.dbsdataset
            raise NotExistingDatasetError(msg)

        ## get info about the parents
        try:
            parents = dbs.getDatasetProvenance(self.dbsdataset, self.dataTiers)
        except DBSInvalidDataTierError, ex:
            msg = ex.getErrorMessage() + ' \n Check the data_tier variable in crab.cfg !\n'
            raise DataDiscoveryError(msg)
        except DBSError, ex:
            raise DataDiscoveryError(ex.getErrorMessage())

        ## check that the parent data tiers requested by the user really exist in the DBS provenance
        self.checkParentDataTier(parents, self.dataTiers)

        ## for each parent get the corresponding fileblocks
        try:
            for p in parents:
                ## fill a list of dbspaths
                parentPath = p.get('parent').get('datasetPathName')
                self.dbspaths.append(parentPath)
                parentBlocks = dbs.getDatasetContents(parentPath)
                self.allblocks.append(parentBlocks.keys())  # add parent fileblocksinfo
        except DBSError, ex:
            raise DataDiscoveryError(ex.getErrorMessage())

    # #################################################
    def checkParentDataTier(self, parents, dataTiers):
        """
        Check that the data tiers requested by the user really exist in the provenance of the given dataset
        """
        startType = string.split(self.dbsdataset, '/')[2]
        # for example 'type' is PU and 'dataTier' is Hit
        parentTypes = map(lambda p: p.get('type'), parents)
        for tier in dataTiers:
            if parentTypes.count(tier) <= 0 and tier != startType:
                msg = "\nERROR Data %s not published in DBS with the requested data tiers: the missing data tier is %s !\n Check the data_tier variable in crab.cfg !" % (self.dbsdataset, tier)
                raise NoDataTierinProvenanceError(msg)

    # #################################################
    def getMaxEvents(self):
        """
        Max events of the primary dataset-owner
        """
        ## loop over the fileblocks of the primary dataset-owner
        nevts = 0
        for blockevts in self.blocksinfo.values():
            nevts = nevts + blockevts

        return nevts

    # #################################################
    def getDBSPaths(self):
        """
        List the DBS paths for all required data
        """
        return self.dbspaths

    # #################################################
    def getEVC(self):
        """
        List the event collection structure by fileblock
        """
        print "To be used by a more complex job splitting... TODO later... "
        print "it requires changes in what's returned by DBSInfo.getDatasetContents and then fetchDBSInfo"

    # #################################################
    def getFileBlocks(self):
        """
        Fileblocks for all required dataset-owners
        """
        return self.allblocks

########################################################################
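
# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): it only
# illustrates how DataDiscovery is typically driven. The owner, dataset
# and data-tier values below are hypothetical placeholders, and
# cfg_params is passed as an empty dict under the assumption that it is
# not consulted by the methods exercised here.
if __name__ == '__main__':
    owner = 'ExampleOwner'      # hypothetical dataset owner
    dataset = 'ExampleDataset'  # hypothetical dataset name
    dataTiers = ['Digi']        # hypothetical list of requested data tiers

    discovery = DataDiscovery(owner, dataset, dataTiers, {})
    try:
        discovery.fetchDBSInfo()
    except (DataDiscoveryError, NotExistingDatasetError, NoDataTierinProvenanceError), ex:
        print ex.getErrorMessage()
        sys.exit(1)

    print "DBS paths  : ", discovery.getDBSPaths()
    print "Max events : ", discovery.getMaxEvents()
    print "File blocks: ", discovery.getFileBlocks()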