Processing/bin/findSamples.py

#!/usr/bin/env python
#---------------------------------------------------------------------------------------------------
# Script to identify all samples which are kept in our database.
#
# Author: C.Paus                                                                (September 23, 2008)
#---------------------------------------------------------------------------------------------------
import os,sys,getopt,re,string

def findStartedDatasets(path):
    """Return the names of all samples found in the listing of *path*.

    Runs the external ``list`` command on *path* and takes the second
    whitespace-separated column of every output line as the sample name; the
    first column is the size reported by ``list`` and is ignored.

    Args:
        path: storage directory, appended verbatim to the ``list`` command line.

    Returns:
        A list with one name per well-formed output line, in output order.
        Lines with fewer than two columns (blank lines, stray text) are skipped.
    """
    # a parenthesised single argument prints the same as a statement and as a function call
    print(" Collecting information over started samples")
    datasetList = []

    pipe = os.popen('list ' + path)          # run command
    try:
        for line in pipe.readlines():
            # split() without argument collapses runs of blanks, so a padded size column
            # cannot push the name out of the second field
            columns = line.split()
            if len(columns) < 2:
                continue
            datasetList.append(columns[1])
    finally:
        pipe.close()

    return datasetList

def findOngoingDatasets(path):
    """Return the sample names of the crab tasks found in the current directory.

    Collects the ``user_remote_dir`` lines of every
    ``crab_[0-9]_[0-9]*_[0-9]*/share/crab.cfg`` relative to the working directory.
    The last '/'-separated component of such a line is the sample name; when that
    component contains ``crab_0`` the component before it is used instead.

    Args:
        path: not used by this function.

    Returns:
        A list with one name per ``user_remote_dir`` line, in the order the shell
        delivers them.
    """
    print(" Collecting information over ongoing samples")
    datasetList = []

    cmd = 'cat crab_[0-9]_[0-9]*_[0-9]*/share/crab.cfg |grep ^user_remote_dir'
    pipe = os.popen(cmd)                     # run command
    try:
        for line in pipe.readlines():
            # drop the line terminator only, never a character of the name
            parts = line.rstrip('\n').split("/")
            dataset = parts[-1]
            # step up one level when the directory ends in the crab task name; the length
            # guard avoids an IndexError on a line that contains no '/' at all
            if 'crab_0' in dataset and len(parts) > 1:
                dataset = parts[-2]
            datasetList.append(dataset)
    finally:
        pipe.close()

    return datasetList

def findCompletedDatasets(path):
    """Return the sample names recorded in ``<mitCfg>/<version>/Completed``.

    ``mitCfg`` and ``version`` are the module-level settings. The file is read
    directly instead of through a ``cat`` subprocess.

    Args:
        path: not used by this function.

    Returns:
        A list with one entry per line of the file, without the line terminator.
        The list is empty when the file cannot be opened.
    """
    print(" Collecting information over completed samples")

    completedFile = mitCfg + '/' + version + '/Completed'
    try:
        handle = open(completedFile)
    except IOError:
        # the shell version discarded the error output of 'cat' and produced an empty list
        return []

    try:
        # rstrip keeps the last name intact when the file does not end with a newline
        return [line.rstrip('\n') for line in handle.readlines()]
    finally:
        handle.close()

def inList(file,list):
    """Return True when *file* equals one of the entries of *list*, else False.

    Args:
        file: the value to look for.
        list: the collection of entries to search.
    """
    # the parameter names shadow builtins but are kept for existing keyword callers
    return file in list

def cleanupCompletedList(ongoingDsetList,completedDsetList):
    """Drop every sample that is ongoing again from the Completed file.

    Each entry of *ongoingDsetList* that also appears in *completedDsetList* is
    reported and its line is removed from ``<mitCfg>/<version>/Completed``
    (``mitCfg`` and ``version`` are the module-level settings). Neither argument
    is modified; callers re-read the file to get the updated list.

    Args:
        ongoingDsetList: sample names that are currently being processed.
        completedDsetList: sample names currently recorded as completed.
    """
    print(" Update completed list with ongoing list")

    completed = set(completedDsetList)
    stale = set()
    for dataset in ongoingDsetList:
        if dataset in completed:
            print(' -> removing fropm completed: ' + dataset)
            stale.add(dataset)
    if not stale:
        return

    completedFile = mitCfg + '/' + version + '/Completed'
    try:
        handle = open(completedFile)
    except IOError:
        # nothing on disk to clean up
        return
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    # Exact string comparison: the former 'grep -v ^name$' treated the name as a regular
    # expression, so a '.' in one name could delete other entries as well.
    kept = [line for line in lines if line.rstrip('\n') not in stale]

    # write next to the target and rename, so the file is replaced in one step and no
    # scratch file is left in the working directory
    tmpFile = completedFile + '.tmp'
    handle = open(tmpFile, 'w')
    try:
        handle.writelines(kept)
    finally:
        handle.close()
    os.rename(tmpFile, completedFile)

#===================================================================================================
# Main starts here
#===================================================================================================
# For every sample listed in the Productions file of the selected configuration the script decides,
# from the storage listing, the crab task directories and the Completed file, whether the sample
# has to be submitted, is still being worked on, is done, or has to be completed. The resulting
# commands are always printed and only executed when --exe is given.
#
# All print calls use one parenthesised argument, which behaves the same as a print statement and
# as a print function.

# Define string to explain usage of the script
usage  = "\nUsage: findSamples.py --mitCfg=<name>\n"
usage += "                      --version=<version>\n"
usage += "                      --cmssw=<name>\n"
usage += "                      --exe\n"
usage += "                      --noInfo\n"
usage += "                      --noDownload\n"
usage += "                      --forceCopy\n"
usage += "                      --debug\n"
usage += "                      --help\n\n"

# Define the valid options which can be specified and check out the command line
valid = ['mitCfg=','version=','cmssw=','help','exe','noInfo','noDownload','forceCopy','debug']
try:
    opts, args = getopt.getopt(sys.argv[1:], "", valid)
except getopt.GetoptError:
    # sys.exc_info() avoids both version-specific spellings of the except clause
    ex = sys.exc_info()[1]
    print(usage)
    print(str(ex))
    sys.exit(1)

# --------------------------------------------------------------------------------------------------
# Get all parameters for the production
# --------------------------------------------------------------------------------------------------
# Set defaults for each option (mitCfg and version are also read as module globals by
# findCompletedDatasets and cleanupCompletedList)
mitCfg     = 'filefi'
version    = '013'
cmssw      = ''
cmsswCfg   = 'cmssw.cfg'
exe        = 0
noInfo     = False
noDownload = False
forceCopy  = False
debug      = False

# Read new values from the command line
for opt, arg in opts:
    if   opt == "--help":
        print(usage)
        sys.exit(0)
    elif opt == "--mitCfg":
        mitCfg     = arg
    elif opt == "--version":
        version    = arg
    elif opt == "--cmssw":
        cmssw      = arg
    elif opt == "--exe":
        exe        = 1
    elif opt == "--noInfo":
        noInfo     = True
    elif opt == "--noDownload":
        noDownload = True
    elif opt == "--forceCopy":
        forceCopy  = True
    elif opt == "--debug":
        debug      = True

# Read parameters needed: the crab configuration is mandatory, the cmssw configuration may be
# present either as cmssw.cfg or as cmssw.py
crabFile  = mitCfg + '/' + version + '/' + 'crab.cfg'
if not os.path.exists(crabFile):
    raise RuntimeError("Crab file not found: %s" % crabFile)
cmsswFile = mitCfg + '/' + version + '/' + cmsswCfg
if not os.path.exists(cmsswFile):
    cmsswCfg  = 'cmssw.py'
    cmsswFile = mitCfg + '/' + version + '/' + cmsswCfg
    if not os.path.exists(cmsswFile):
        raise RuntimeError(" XXXX ERROR no valid configuration found XXXX")

# Find all started samples: the storage path is the castor one as soon as the crab configuration
# has a storage_element line matching cern.ch, the MIT one otherwise
path = '/pnfs/cmsaf.mit.edu/t2bat/cms/store/user/paus/' + mitCfg + '/' + version
cmd  = 'grep ^storage_element ' + crabFile + '| grep cern.ch'
if os.popen(cmd).readlines():
    path = '/castor/cern.ch/user/p/paus/' + mitCfg + '/' + version

startedDsetList = findStartedDatasets(path)

# The Completed file is read twice: once to find the entries which are ongoing again and once
# more after those entries have been removed
ongoingDsetList = findOngoingDatasets(path)
completedDsetList = findCompletedDatasets(path)
cleanupCompletedList(ongoingDsetList,completedDsetList)
completedDsetList = findCompletedDatasets(path)

# Resolve the other mitCfg parameters from the configuration file
cmd = 'cat ' + mitCfg + '/' + version + '/' + 'Productions'
if cmssw != '':
    cmd = cmd + '.' + cmssw

print('')
join       = 0
mitDataset = ""
fullLine   = ""
bSlash     = "\\"
for line in os.popen(cmd).readlines():  # run command
    # remove the line terminator only (a last line without one keeps its final character)
    line = line.rstrip('\n')
    # get rid of empty lines
    if line == '':
        continue
    # get rid of commented lines and read steering parameters
    if line[0] == '#':
        names = line.split()       # splitting every blank
        # a comment whose second token is 'crontab' carries a switch in its fourth token; any
        # value other than 1 stops the run (the fourth token must exist before it is read)
        if len(names) > 3 and names[1] == 'crontab' and int(names[3]) != 1:
            print('No crontab has been set.... (LINE: ' + line + ')')
            sys.exit(0)
        continue

    # join lines
    if join == 1:
        fullLine += line
    else:
        fullLine  = line

    # determine if finished or more is coming
    if fullLine.endswith(bSlash):
        join = 1
        fullLine = fullLine[:-1]
        continue

    join = 0
    fullLine = " ".join(fullLine.split())
    names    = fullLine.split()         # splitting every blank
    if debug:
        print("FullLine: " + fullLine)

    # a definition needs five columns; report and skip anything shorter instead of aborting
    # the whole run with an IndexError
    if len(names) < 5:
        if names:
            print(' ERROR - incomplete sample definition skipped: ' + fullLine)
        continue

    cmsDataset = names[0]
    mitDataset = names[1]               # this is the equivalent MIT name of the dataset
    nevents    = int(names[2])          # number of events to be used in the production
    procStatus = names[3]               # parsed but not used for the decisions below
    local      = names[4]

    cmd = 'submit.py --mitDataset=' + mitDataset + ' --mitCfg=' + mitCfg + \
          ' --version=' + version + ' --noTestJob'
    if cmssw != '':
        cmd = cmd + " --cmssw=" + cmssw

    # check for errors (to be done)

    # check for the logical combinations
    if   not inList(mitDataset,startedDsetList):
        # nothing in storage yet: new sample
        print(' submitting: ' + cmd)
        if exe == 1:
            os.system(cmd)

    elif     inList(mitDataset,ongoingDsetList):
        # a crab task exists for it: leave it alone
        print(' handled by jobSitter -- ' + mitDataset)

    elif     inList(mitDataset,completedDsetList):
        if not noInfo:
            print(' don: ' + mitDataset)

    else:
        # started, not ongoing and not recorded as completed
        cmd = cmd + ' --complete'
        print(' completing: ' + cmd)
        if exe == 1:
            os.system(cmd)

    # test download request ("-" in the fifth column means no local copy is wanted)
    if local != "-" and not noDownload:
        cmd = 'downloadSample.py --cmsDataset=' + cmsDataset + ' --mitCfg=' + mitCfg + \
              " --version=" + version
        if cmssw != '':
            cmd = cmd + " --cmssw=" + cmssw
        if forceCopy:
            cmd += ' --forceCopy'
        print(" " + cmd)
        if exe == 1:
            os.system(cmd)

# NOTE(review): the missing-dataset case is reported as an error but exits with status 0, exactly
# as before - confirm whether callers would rather see a non-zero status here.
if mitDataset == "":
    print("ERROR - dataset not defined.")
    sys.exit(0)

sys.exit(0)
Revision:	1.1.2.1
Committed:	Sat Jun 5 01:49:21 2010 UTC (14 years, 11 months ago) by paus
Content type:	text/x-python
Branch:	Mit_013c
Changes since 1.1:	+287 -0 lines
Log Message:	first reasonably working production environment
#	User	Rev	Content
1	paus	1.1.2.1	#!/usr/bin/env python
2			#---------------------------------------------------------------------------------------------------
3			# Script to identify all samples which are kept in our database.
4			#
5			# Author: C.Paus (September 23, 2008)
6			#---------------------------------------------------------------------------------------------------
7			import os,sys,getopt,re,string
8
9			def findStartedDatasets(path):
10			print " Collecting information over started samples"
11			datasetList = []
12
13			cmd = 'list ' + path
14			for line in os.popen(cmd).readlines(): # run command
15			line = line[:-1] # strip '\n'
16			f = line.split(" ")
17			size = f[0]
18			file = f[1]
19
20			#if debug == 1:
21			# print ' Debug:: adding: ' + file + ' with size ' + size
22			datasetList.append(file)
23
24			return datasetList
25
26			def findOngoingDatasets(path):
27			print " Collecting information over ongoing samples"
28			datasetList = []
29
30			cmd = 'cat crab_[0-9]_[0-9]_[0-9]/share/crab.cfg \|grep ^user_remote_dir'
31			for line in os.popen(cmd).readlines(): # run command
32			line = line[:-1] # strip '\n'
33			f = line.split("/")
34			dataset = f[-1]
35			if re.search('crab_0',dataset):
36			dataset = f[-2]
37
38			#if debug == 1:
39			# print ' Debug:: adding: ' + dataset '
40			datasetList.append(dataset)
41
42			return datasetList
43
44			def findCompletedDatasets(path):
45			print " Collecting information over completed samples"
46			datasetList = []
47
48			cmd = 'cat ' + mitCfg + '/' + version + '/Completed 2> /dev/null'
49			for line in os.popen(cmd).readlines(): # run command
50			line = line[:-1] # strip '\n'
51			dataset = line
52
53			#if debug == 1:
54			# print ' Debug:: adding: ' + dataset '
55			datasetList.append(dataset)
56
57			return datasetList
58
59			def inList(file,list):
60			for entry in list:
61			if entry == file:
62			return True
63			return False
64
65			def cleanupCompletedList(ongoingDsetList,completedDsetList):
66			print " Update completed list with ongoing list"
67
68			for dataset in ongoingDsetList:
69			if inList(dataset,completedDsetList):
70			print ' -> removing fropm completed: ' + dataset
71			cmd = 'cat ' + mitCfg + '/' + version + '/Completed\|grep -v ^' + dataset + '$ > C.bak'
72			cmd += '; mv C.bak ' + mitCfg + '/' + version + '/Completed'
73			os.system(cmd)
74
75			#===================================================================================================
76			# Main starts here
77			#===================================================================================================
78			# Define string to explain usage of the script
79			usage = "\nUsage: findSamples.py --mitCfg=<name>\n"
80			usage += " --version=<version>\n"
81			usage += " --cmssw=<name>\n"
82			usage += " --exe\n"
83			usage += " --noInfo\n"
84			usage += " --noDownload\n"
85			usage += " --forceCopy\n"
86			usage += " --debug\n"
87			usage += " --help\n\n"
88
89			# Define the valid options which can be specified and check out the command line
90			valid = ['mitCfg=','version=','cmssw=','help','exe','noInfo','noDownload','forceCopy','debug']
91			try:
92			opts, args = getopt.getopt(sys.argv[1:], "", valid)
93			except getopt.GetoptError, ex:
94			print usage
95			print str(ex)
96			sys.exit(1)
97
98			# --------------------------------------------------------------------------------------------------
99			# Get all parameters for the production
100			# --------------------------------------------------------------------------------------------------
101			# Set defaults for each option
102			mitCfg = 'filefi'
103			version = '013'
104			cmssw = ''
105			cmsswCfg = 'cmssw.cfg'
106			exe = 0
107			noInfo = False
108			noDownload = False
109			forceCopy = False
110			debug = False
111
112			# Read new values from the command line
113			for opt, arg in opts:
114			if opt == "--help":
115			print usage
116			sys.exit(0)
117			if opt == "--mitCfg":
118			mitCfg = arg
119			if opt == "--version":
120			version = arg
121			if opt == "--cmssw":
122			cmssw = arg
123			if opt == "--exe":
124			exe = 1
125			if opt == "--noInfo":
126			noInfo = True
127			if opt == "--noDownload":
128			noDownload = True
129			if opt == "--forceCopy":
130			forceCopy = True
131			if opt == "--debug":
132			debug = True
133
134			# Read parameters needed
135			crabFile = mitCfg + '/' + version + '/' + 'crab.cfg'
136			if not os.path.exists(crabFile):
137			cmd = "Crab file not found: %s" % crabFile
138			raise RuntimeError, cmd
139			cmsswFile = mitCfg + '/' + version + '/' + cmsswCfg
140			if not os.path.exists(cmsswFile):
141			cmd = "Cmssw file not found: %s" % cmsswFile
142			cmsswCfg = 'cmssw.py'
143			cmsswFile = mitCfg + '/' + version + '/' + cmsswCfg
144			if not os.path.exists(cmsswFile):
145			cmd = "Cmssw file not found: %s" % cmsswFile
146			cmd = " XXXX ERROR no valid configuration found XXXX"
147			raise RuntimeError, cmd
148
149			# Find all started samples
150			path = '/pnfs/cmsaf.mit.edu/t2bat/cms/store/user/paus/' + mitCfg + '/' + version
151			cmd = 'grep ^storage_element ' + crabFile + '\| grep cern.ch'
152			for line in os.popen(cmd).readlines():
153			path = '/castor/cern.ch/user/p/paus/' + mitCfg + '/' + version
154
155			startedDsetList = findStartedDatasets(path)
156			#print " Dataset list: "
157			#for dataset in startedDsetList:
158			# print ' -> ' + dataset
159
160			ongoingDsetList = findOngoingDatasets(path)
161			completedDsetList = findCompletedDatasets(path)
162			cleanupCompletedList(ongoingDsetList,completedDsetList)
163			completedDsetList = findCompletedDatasets(path)
164
165			# Resolve the other mitCfg parameters from the configuration file
166			cmd = 'cat ' + mitCfg + '/' + version + '/' + 'Productions'
167			if cmssw != '':
168			cmd = cmd + '.' + cmssw
169
170
171			print ''
172			join = 0
173			mitDataset = ""
174			fullLine = ""
175			bSlash = "\\";
176			for line in os.popen(cmd).readlines(): # run command
177			line = line[:-1]
178			#print 'Line: "' + line + '"'
179			# get ride of empty lines
180			if line == '':
181			continue
182			# get ride of commented lines and read steering parameters
183			if line[0] == '#':
184			names = line.split() # splitting every blank
185			if len(names)> 2 and names[1] == 'crontab' and int(names[3]) != 1:
186			print 'No crontab has been set.... (LINE: ' + line + ')'
187			sys.exit(0)
188			continue
189
190			# join lines
191			if join == 1:
192			fullLine += line
193			else:
194			fullLine = line
195
196			# determine if finished or more is coming
197			if fullLine[-1] == bSlash:
198			join = 1
199			fullLine = fullLine[:-1]
200			else:
201			join = 0
202			fullLine = " ".join(str(fullLine).split()).strip()
203			# test whether there is a directory
204			names = fullLine.split() # splitting every blank
205			if debug == True:
206			print "FullLine: " + fullLine
207			cmsDataset = names[0]
208			mitDataset = names[1] # this is the equivalent MIT name of the dataset
209			nevents = int(names[2]) # number of events to be used in the production
210			procStatus = names[3]
211			local = names[4]
212
213			cmd = 'submit.py --mitDataset=' + mitDataset + ' --mitCfg=' + mitCfg + \
214			' --version=' + version + ' --noTestJob'
215			if cmssw != '':
216			cmd = cmd + " --cmssw=" + cmssw
217
218			# check for errors (to be done)
219
220			# check for the logical combinations
221			if not inList(mitDataset,startedDsetList):
222
223			#print ' new: ' + mitDataset
224			print ' submitting: ' + cmd
225			if exe == 1:
226			os.system(cmd)
227
228			elif inList(mitDataset,ongoingDsetList):
229
230			#print ' sub: ' + mitDataset
231			print ' handled by jobSitter -- ' + mitDataset
232
233			elif inList(mitDataset,completedDsetList):
234			if not noInfo:
235			print ' don: ' + mitDataset
236			else:
237
238			cmd = cmd + ' --complete'
239			#print ' toc: ' + mitDataset
240			print ' completing: ' + cmd
241			if exe == 1:
242			os.system(cmd)
243
244			# test download request
245			if local != "-" and not noDownload:
246			localPath = local
247			cmd = 'downloadSample.py --cmsDataset=' + cmsDataset + ' --mitCfg=' + mitCfg + \
248			" --version=" + version
249			if cmssw != '':
250			cmd = cmd + " --cmssw=" + cmssw
251			if forceCopy:
252			cmd += ' --forceCopy'
253			print " " + cmd
254			if exe == 1:
255			status = os.system(cmd)
256
257			## if procStatus == "new":
258			## print " " + cmd
259			## if exe == 1:
260			## status = os.system(cmd)
261			## elif procStatus == "com":
262			## cmd = cmd + ' --noTestJob --complete'
263			## print " " + cmd
264			## if exe == 1:
265			## status = os.system(cmd)
266			## elif local != "-" and not noDownload:
267			## localPath = local
268			## cmd = 'downloadSample.py --cmsDataset=' + cmsDataset + ' --mitCfg=' + mitCfg + \
269			## " --version=" + version
270			## if forceCopy:
271			## cmd += ' --forceCopy'
272			## print " " + cmd
273			## if exe == 1:
274			## status = os.system(cmd)
275			## else:
276			## if not noInfo:
277			## print " Sample Info: " + fullLine
278
279			if mitDataset == "":
280			print "ERROR - dataset not defined."
281			sys.exit(0)
282
283			sys.exit(0)
284
285			# Say what we do now
286			print ' Preparing dataset: ' + cmsDataset + ' [MIT: ' + mitDataset + ' with ' + str(nevents) + \
287			' per job]'