Processing/bin/findSamples.py

#!/usr/bin/env python
#---------------------------------------------------------------------------------------------------
# Script to identify all samples which are kept in our database.
#
# Author: C.Paus                                                                (September 23, 2008)
#---------------------------------------------------------------------------------------------------
import os,sys,getopt,re,string

def findStartedDatasets(path):
    """Return the names of all samples that already exist in the storage area.

    Runs the external 'list' command on 'path'. Each output line is expected to hold a size
    followed by a name; only the name (second field) is kept.

    path -- storage directory handed to the 'list' command

    Returns the list of names in the order they are reported. Lines with fewer than two
    whitespace separated fields (blank lines, stray messages) are ignored.
    """
    if debug == 1:
        print(" Collecting information over started samples")
    datasetList = []

    cmd = 'list ' + path
    for line in os.popen(cmd).readlines():   # run command
        # split() without argument copes with padded or tab separated columns and drops the
        # trailing '\n'; split(" ") would produce empty fields for repeated blanks
        fields = line.split()
        if len(fields) < 2:
            continue
        datasetList.append(fields[1])

    return datasetList

def findOngoingDatasets(path):
    """Return the dataset names found in the crab task directories of the working directory.

    The 'user_remote_dir' lines of all crab_*/share/crab.cfg files are collected. The dataset
    is the last '/' separated component of such a line, unless that component contains
    'crab_0', in which case the component before it is used.

    path -- not used, kept so that all find*Datasets functions are called the same way

    Returns the list of names, one per matching line.
    """
    if debug == 1:
        print(" Collecting information over ongoing samples")

    cmd = 'cat crab_[0-9]_[0-9]*_[0-9]*/share/crab.cfg |grep ^user_remote_dir'
    ongoing = []
    for rawLine in os.popen(cmd).readlines():   # run command
        parts = rawLine[:-1].split("/")          # drop '\n', then cut the remote directory
        if re.search('crab_0',parts[-1]):
            ongoing.append(parts[-2])
        else:
            ongoing.append(parts[-1])

    return ongoing

def findCompletedDatasets(path):
    """Return the entries of the 'Completed' file of the selected configuration and version.

    The file is read with 'cat' from $MIT_PROD_DIR/<mitCfg>/<version>/Completed; error output
    of the command is discarded, so a missing file simply gives an empty list.

    path -- not used, kept so that all find*Datasets functions are called the same way

    Returns one entry per line of the file, with the last character (the '\\n') removed.
    """
    if debug == 1:
        print(" Collecting information over completed samples")

    prodDir = os.environ['MIT_PROD_DIR']
    cmd = 'cat ' + prodDir + '/' + mitCfg + '/' + version + '/Completed  2> /dev/null'

    # run command and strip the last character of every line
    return [ entry[:-1] for entry in os.popen(cmd).readlines() ]

def inList(file,list):
    """Tell whether 'file' is one of the entries of 'list'.

    file -- value to look for
    list -- sequence of entries to search

    Returns True if 'file' is contained in 'list', False otherwise (also for an empty list).
    """
    # the membership operator performs the same linear equality search as an explicit loop
    return file in list

def cleanupCompletedList(ongoingDsetList,completedDsetList):
    """Remove every ongoing dataset from the 'Completed' bookkeeping file.

    ongoingDsetList   -- dataset names that are being processed (see findOngoingDatasets)
    completedDsetList -- dataset names currently listed as completed (see findCompletedDatasets)

    The file updated is the one findCompletedDatasets() reads, that is
    $MIT_PROD_DIR/<mitCfg>/<version>/Completed, independent of the current working directory.
    It is rewritten at most once, through a temporary file next to it, and lines are compared
    literally, so characters that are special to the shell or to grep are harmless. A failure
    to read or write the file is reported and otherwise ignored.

    Returns nothing; the lists passed in are not modified.
    """
    if debug == 1:
        print(" Update completed list with ongoing list")

    stale = []
    for dataset in ongoingDsetList:
        if inList(dataset,completedDsetList):
            print(' -> removing fropm completed: ' + dataset)
            stale.append(dataset)
    if len(stale) == 0:
        return

    completedFile = os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/Completed'
    tmpFile       = completedFile + '.bak'
    try:
        source = open(completedFile,'r')
        try:
            lines = source.readlines()
        finally:
            source.close()

        target = open(tmpFile,'w')
        try:
            for line in lines:
                if not inList(line.rstrip('\n'),stale):
                    target.write(line)
        finally:
            target.close()

        # replace the list only once the new version is written completely
        os.rename(tmpFile,completedFile)
    except (IOError, OSError):
        # best effort, as the shell commands used before: report and carry on
        print(' ERROR - could not update ' + completedFile + ': ' + str(sys.exc_info()[1]))

#===================================================================================================
# Main starts here
#===================================================================================================
# Define string to explain usage of the script
usage  = "\nUsage: findSamples.py --mitCfg=<name>\n"
usage += "                      --version=<version> [ default: MIT_VERS ]\n"
usage += "                      --cmssw=<name>\n"
usage += "                      --pattern=<name>\n"
usage += "                      --download=<int: -1,0,1>\n"
usage += "                      --status=<int: -1,0,1>\n"
usage += "                      --useExistingLfns\n"
usage += "                      --exe\n"
usage += "                      --complete\n"
usage += "                      --noInfo\n"
usage += "                      --forceCopy\n"
usage += "                      --debug\n"
usage += "                      --help\n\n"

# Define the valid options which can be specified and check out the command line
# (every option listed here has to show up in the usage text above)
valid = ['mitCfg=','version=','cmssw=','pattern=','download=','status=',
         'help','exe','useExistingLfns','complete','noInfo','forceCopy','debug']
try:
    opts, args = getopt.getopt(sys.argv[1:], "", valid)
except getopt.GetoptError:
    # the exception is fetched through sys.exc_info() so that this clause is valid syntax for
    # every python version
    print(usage)
    print(str(sys.exc_info()[1]))
    sys.exit(1)

# --------------------------------------------------------------------------------------------------
# Get all parameters for the production
# --------------------------------------------------------------------------------------------------
# Set defaults for each option
mitCfg          = 'filefi'
version         = os.environ.get('MIT_VERS','')   # only a default: may be missing if --version
cmssw           = ''
pattern         = ''
cmsswCfg        = 'cmssw.cfg'
exe             = 0
useExistingLfns = False
complete        = 0
noInfo          = False
download        = -1
status          = -1
forceCopy       = False
debug           = False

# Read new values from the command line
for opt, arg in opts:
    if   opt == "--help":
        print(usage)
        sys.exit(0)
    elif opt == "--mitCfg":
        mitCfg          = arg
    elif opt == "--version":
        version         = arg
    elif opt == "--cmssw":
        cmssw           = arg
    elif opt == "--pattern":
        pattern         = arg
    elif opt == "--exe":
        exe             = 1
    elif opt == "--useExistingLfns":
        useExistingLfns = True
    elif opt == "--download":
        download        = int(arg)
    elif opt == "--status":
        status          = int(arg)
    elif opt == "--complete":
        complete        = 1
    elif opt == "--noInfo":
        noInfo          = True
    elif opt == "--forceCopy":
        forceCopy       = True
    elif opt == "--debug":
        debug           = True

# The version is part of every path used below: stop here if neither source provided it
if version == '':
    print(usage)
    print("ERROR - no version specified: use --version or define MIT_VERS.")
    sys.exit(1)

# Read parameters needed
# directory which holds the configuration files of the selected mitCfg and version
cfgDir   = os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version

# the crab configuration is mandatory
crabFile = cfgDir + '/' + 'crab.cfg'
if not os.path.exists(crabFile):
    raise RuntimeError("Crab file not found: %s" % crabFile)

# the cmssw configuration is looked for under its default name (cmssw.cfg) first and under
# cmssw.py as fall back; cmsswCfg and cmsswFile end up describing the file which exists
cmsswFile = cfgDir + '/' + cmsswCfg
if not os.path.exists(cmsswFile):
    cmsswCfg  = 'cmssw.py'
    cmsswFile = cfgDir + '/' + cmsswCfg
    if not os.path.exists(cmsswFile):
        raise RuntimeError(" XXXX ERROR no valid configuration found XXXX")

# Find all started samples
# The samples are looked for in the castor area if the crab configuration names a storage
# element at cern.ch, in the pnfs area otherwise
cmd = 'grep ^storage_element ' + crabFile + '| grep cern.ch'
if os.popen(cmd).readlines():
    path = '/castor/cern.ch/user/p/paus/' + mitCfg + '/' + version
else:
    path = '/pnfs/cmsaf.mit.edu/t2bat/cms/store/user/paus/' + mitCfg + '/' + version

startedDsetList = findStartedDatasets(path)
ongoingDsetList = findOngoingDatasets(path)

# Datasets which are ongoing (again) are taken off the completed list before the list is
# read for good
cleanupCompletedList(ongoingDsetList,findCompletedDatasets(path))
completedDsetList = findCompletedDatasets(path)

# Resolve the other mitCfg parameters from the configuration file
cmd = 'cat '+ os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/' + 'Productions'
if cmssw != '':
    cmd = cmd + '.' + cmssw


print('')

# Loop through the production file. Lines starting with '#' are comments (a 'crontab' comment
# steers whether anything is done at all), a trailing backslash continues an entry on the next
# line, and every complete entry consists of at least five blank separated fields:
#   <cmsDataset> <mitDataset> <nevents> <procStatus> <local>
join       = 0
mitDataset = ""
fullLine   = ""
bSlash     = "\\"
printOpt   = "-header"
for line in os.popen(cmd).readlines():  # run command (read completely before 'cmd' is reused)
    line = line[:-1]
    # get rid of empty lines
    if line == '':
        continue
    # get rid of commented lines and read steering parameters
    if line[0] == '#':
        names = line.split()       # splitting every blank
        # the flag is the fourth field, therefore at least four fields have to be there
        if len(names) > 3 and names[1] == 'crontab' and int(names[3]) != 1:
            print('No crontab has been set.... (LINE: ' + line + ')')
            sys.exit(0)
        continue

    # join lines
    if join == 1:
        fullLine += line
    else:
        fullLine  = line

    # determine if finished or more is coming
    if fullLine[-1] == bSlash:
        join = 1
        fullLine = fullLine[:-1]
        continue
    join = 0

    # the entry is complete: normalize the spacing and cut it into its fields
    fullLine = " ".join(str(fullLine).split()).strip()
    names    = fullLine.split()       # splitting every blank
    if debug:
        print("FullLine: " + fullLine)
    if len(names) == 0:
        continue                      # the line contained blanks only
    if len(names) < 5:
        print(' WARNING - skipping incomplete line: ' + fullLine)
        continue
    cmsDataset = names[0]
    mitDataset = names[1]               # this is the equivalent MIT name of the dataset
    nevents    = int(names[2])          # number of events to be used in the production
    procStatus = names[3]
    local      = names[4]

    if pattern != '' and not re.search(pattern,mitDataset):
        continue

    # make sure we want to consider submission
    if download != 1 and status != 1:
        cmd = 'submit.py --mitDataset=' + mitDataset + ' --mitCfg=' + mitCfg + \
              ' --version=' + version + ' --noTestJob'
        if cmssw != '':
            cmd = cmd + " --cmssw=" + cmssw
        if useExistingLfns:
            cmd = cmd + " --useExistingLfns"

        # check for errors (to be done)

        # check for the logical combinations
        if   not inList(mitDataset,startedDsetList):
            # never started: submit from scratch
            print(' submitting: ' + cmd)
            if exe == 1:
                os.system(cmd)

        elif     inList(mitDataset,ongoingDsetList):
            # a crab task exists: nothing to do here
            print(' handled by jobSitter -- ' + mitDataset)

        elif     inList(mitDataset,completedDsetList):
            if not noInfo:
                print(' don: ' + mitDataset)
        else:
            # started, but neither ongoing nor completed: complete it if requested
            if complete == 1:
                cmd = cmd + ' --complete'
                print(' completing: ' + cmd)
                if exe == 1:
                    os.system(cmd)

    # test status request
    if status != -1:
        cmd = 'status.sh ' + mitCfg + '/' + version + ' ' + mitDataset + ' ' + printOpt
        if exe == 1:
            os.system(cmd)
        else:
            print(" " + cmd)
        printOpt = ""                   # the header option is passed for the first dataset only

    # test download request
    if local != "-" and download != -1:
        cmd = 'downloadSample.py --cmsDataset=' + cmsDataset + ' --mitCfg=' + mitCfg + \
              " --version=" + version
        if cmssw != '':
            cmd = cmd + " --cmssw=" + cmssw
        if forceCopy:
            cmd += ' --forceCopy'
        print(" " + cmd)
        if exe == 1:
            os.system(cmd)

# NOTE(review): the exit status is 0 although this is reported as an error -- left unchanged so
# that existing callers see the same status as before
if mitDataset == "":
    print("ERROR - dataset not defined.")
    sys.exit(0)

sys.exit(0)
Revision:	1.10
Committed:	Tue Mar 22 02:48:51 2011 UTC (14 years, 1 month ago) by paus
Content type:	text/x-python
Branch:	MAIN
CVS Tags:	Mit_024b, Mit_025pre1, Mit_024a, Mit_024, Mit_023, Mit_022a, Mit_022, Mit_020d, TMit_020d, Mit_020c, Mit_021, Mit_021pre2, Mit_021pre1, Mit_020b, Mit_020a, Mit_020
Changes since 1.9:	+2 -2 lines
Log Message:	Version 020 updates (64 bit architecture).
#	Content
1	#!/usr/bin/env python
2	#---------------------------------------------------------------------------------------------------
3	# Script to identify all samples which are kept in our database.
4	#
5	# Author: C.Paus (September 23, 2008)
6	#---------------------------------------------------------------------------------------------------
7	import os,sys,getopt,re,string
8
9	def findStartedDatasets(path):
10	if debug == 1:
11	print " Collecting information over started samples"
12	datasetList = []
13
14	cmd = 'list ' + path
15	for line in os.popen(cmd).readlines(): # run command
16	line = line[:-1] # strip '\n'
17	f = line.split(" ")
18	size = f[0]
19	file = f[1]
20
21	#if debug == 1:
22	# print ' Debug:: adding: ' + file + ' with size ' + size
23	datasetList.append(file)
24
25	return datasetList
26
27	def findOngoingDatasets(path):
28	if debug == 1:
29	print " Collecting information over ongoing samples"
30	datasetList = []
31
32	cmd = 'cat crab_[0-9]_[0-9]_[0-9]/share/crab.cfg \|grep ^user_remote_dir'
33	for line in os.popen(cmd).readlines(): # run command
34	line = line[:-1] # strip '\n'
35	f = line.split("/")
36	dataset = f[-1]
37	if re.search('crab_0',dataset):
38	dataset = f[-2]
39
40	#if debug == 1:
41	# print ' Debug:: adding: ' + dataset '
42	datasetList.append(dataset)
43
44	return datasetList
45
46	def findCompletedDatasets(path):
47	if debug == 1:
48	print " Collecting information over completed samples"
49	datasetList = []
50
51	cmd = 'cat ' + os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/Completed 2> /dev/null'
52	for line in os.popen(cmd).readlines(): # run command
53	line = line[:-1] # strip '\n'
54	dataset = line
55
56	#if debug == 1:
57	# print ' Debug:: adding: ' + dataset '
58	datasetList.append(dataset)
59
60	return datasetList
61
62	def inList(file,list):
63	for entry in list:
64	if entry == file:
65	return True
66	return False
67
68	def cleanupCompletedList(ongoingDsetList,completedDsetList):
69	if debug == 1:
70	print " Update completed list with ongoing list"
71
72	for dataset in ongoingDsetList:
73	if inList(dataset,completedDsetList):
74	print ' -> removing fropm completed: ' + dataset
75	cmd = 'cat ' + mitCfg + '/' + version + '/Completed\|grep -v ^' + dataset + '$ > C.bak'
76	cmd += '; mv C.bak ' + mitCfg + '/' + version + '/Completed'
77	os.system(cmd)
78
79	#===================================================================================================
80	# Main starts here
81	#===================================================================================================
82	# Define string to explain usage of the script
83	usage = "\nUsage: findSamples.py --mitCfg=<name>\n"
84	usage += " --version=<version> [ default: MIT_VERS ]\n"
85	usage += " --cmssw=<name>\n"
86	usage += " --pattern=<name>\n"
87	usage += " --download=<int: -1,0,1>\n"
88	usage += " --status=<int: -1,0,1>\n"
89	usage += " --useExistingLfns\n"
90	usage += " --exe\n"
91	usage += " --noInfo\n"
92	usage += " --forceCopy\n"
93	usage += " --debug\n"
94	usage += " --help\n\n"
95
96	# Define the valid options which can be specified and check out the command line
97	valid = ['mitCfg=','version=','cmssw=','pattern=','download=','status=', \
98	'help','exe','useExistingLfns','complete','noInfo','forceCopy','debug']
99	try:
100	opts, args = getopt.getopt(sys.argv[1:], "", valid)
101	except getopt.GetoptError, ex:
102	print usage
103	print str(ex)
104	sys.exit(1)
105
106	# --------------------------------------------------------------------------------------------------
107	# Get all parameters for the production
108	# --------------------------------------------------------------------------------------------------
109	# Set defaults for each option
110	mitCfg = 'filefi'
111	version = os.environ['MIT_VERS']
112	cmssw = ''
113	pattern = ''
114	cmsswCfg = 'cmssw.cfg'
115	exe = 0
116	useExistingLfns = False
117	complete = 0
118	noInfo = False
119	download = -1
120	status = -1
121	forceCopy = False
122	debug = False
123
124	# Read new values from the command line
125	for opt, arg in opts:
126	if opt == "--help":
127	print usage
128	sys.exit(0)
129	if opt == "--mitCfg":
130	mitCfg = arg
131	if opt == "--version":
132	version = arg
133	if opt == "--cmssw":
134	cmssw = arg
135	if opt == "--pattern":
136	pattern = arg
137	if opt == "--exe":
138	exe = 1
139	if opt == "--useExistingLfns":
140	useExistingLfns = True
141	if opt == "--download":
142	download = int(arg)
143	if opt == "--status":
144	status = int(arg)
145	if opt == "--complete":
146	complete = 1
147	if opt == "--noInfo":
148	noInfo = True
149	if opt == "--forceCopy":
150	forceCopy = True
151	if opt == "--debug":
152	debug = True
153
154	# Read parameters needed
155	crabFile = os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/' + 'crab.cfg'
156	if not os.path.exists(crabFile):
157	cmd = "Crab file not found: %s" % crabFile
158	raise RuntimeError, cmd
159	cmsswFile = os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/' + cmsswCfg
160	if not os.path.exists(cmsswFile):
161	cmd = "Cmssw file not found: %s" % cmsswFile
162	cmsswCfg = 'cmssw.py'
163	cmsswFile = os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/' + cmsswCfg
164	if not os.path.exists(cmsswFile):
165	cmd = "Cmssw file not found: %s" % cmsswFile
166	cmd = " XXXX ERROR no valid configuration found XXXX"
167	raise RuntimeError, cmd
168
169	# Find all started samples
170	path = '/pnfs/cmsaf.mit.edu/t2bat/cms/store/user/paus/' + mitCfg + '/' + version
171	cmd = 'grep ^storage_element ' + crabFile + '\| grep cern.ch'
172	for line in os.popen(cmd).readlines():
173	path = '/castor/cern.ch/user/p/paus/' + mitCfg + '/' + version
174
175	startedDsetList = findStartedDatasets(path)
176	#print " Dataset list: "
177	#for dataset in startedDsetList:
178	# print ' -> ' + dataset
179
180	ongoingDsetList = findOngoingDatasets(path)
181	completedDsetList = findCompletedDatasets(path)
182	cleanupCompletedList(ongoingDsetList,completedDsetList)
183	completedDsetList = findCompletedDatasets(path)
184
185	# Resolve the other mitCfg parameters from the configuration file
186	cmd = 'cat '+ os.environ['MIT_PROD_DIR'] + '/' + mitCfg + '/' + version + '/' + 'Productions'
187	if cmssw != '':
188	cmd = cmd + '.' + cmssw
189
190
191	print ''
192
193	join = 0
194	mitDataset = ""
195	fullLine = ""
196	bSlash = "\\";
197	printOpt = "-header"
198	for line in os.popen(cmd).readlines(): # run command
199	line = line[:-1]
200	#print 'Line: "' + line + '"'
201	# get ride of empty lines
202	if line == '':
203	continue
204	# get ride of commented lines and read steering parameters
205	if line[0] == '#':
206	names = line.split() # splitting every blank
207	if len(names)> 2 and names[1] == 'crontab' and int(names[3]) != 1:
208	print 'No crontab has been set.... (LINE: ' + line + ')'
209	sys.exit(0)
210	continue
211
212	# join lines
213	if join == 1:
214	fullLine += line
215	else:
216	fullLine = line
217
218	# determine if finished or more is coming
219	if fullLine[-1] == bSlash:
220	join = 1
221	fullLine = fullLine[:-1]
222	else:
223	join = 0
224	fullLine = " ".join(str(fullLine).split()).strip()
225	# test whether there is a directory
226	names = fullLine.split() # splitting every blank
227	if debug == True:
228	print "FullLine: " + fullLine
229	cmsDataset = names[0]
230	mitDataset = names[1] # this is the equivalent MIT name of the dataset
231	nevents = int(names[2]) # number of events to be used in the production
232	procStatus = names[3]
233	local = names[4]
234
235	if pattern != '' and not re.search(pattern,mitDataset):
236	continue
237
238	# make sure we want to consider submission
239	if download != 1 and status != 1:
240	cmd = 'submit.py --mitDataset=' + mitDataset + ' --mitCfg=' + mitCfg + \
241	' --version=' + version + ' --noTestJob'
242	if cmssw != '':
243	cmd = cmd + " --cmssw=" + cmssw
244	if useExistingLfns:
245	cmd = cmd + " --useExistingLfns"
246
247	# check for errors (to be done)
248
249	# check for the logical combinations
250	if not inList(mitDataset,startedDsetList):
251	#print ' new: ' + mitDataset
252	print ' submitting: ' + cmd
253	if exe == 1:
254	os.system(cmd)
255
256	elif inList(mitDataset,ongoingDsetList):
257	#print ' sub: ' + mitDataset
258	if download != 1 and status != 1:
259	print ' handled by jobSitter -- ' + mitDataset
260
261	elif inList(mitDataset,completedDsetList):
262	if not noInfo:
263	print ' don: ' + mitDataset
264	else:
265	if complete == 1:
266	cmd = cmd + ' --complete'
267	#print ' toc: ' + mitDataset
268	print ' completing: ' + cmd
269	if exe == 1:
270	os.system(cmd)
271
272	# test download request
273	if status != -1:
274	cmd = 'status.sh ' + mitCfg + '/' + version + ' ' + mitDataset + ' ' + printOpt
275	if exe == 1:
276	rc = os.system(cmd)
277	else:
278	print " " + cmd
279	printOpt = ""
280
281	# test download request
282	if local != "-" and download != -1:
283	localPath = local
284	cmd = 'downloadSample.py --cmsDataset=' + cmsDataset + ' --mitCfg=' + mitCfg + \
285	" --version=" + version
286	if cmssw != '':
287	cmd = cmd + " --cmssw=" + cmssw
288	if forceCopy:
289	cmd += ' --forceCopy'
290	print " " + cmd
291	if exe == 1:
292	rc = os.system(cmd)
293
294	if mitDataset == "":
295	print "ERROR - dataset not defined."
296	sys.exit(0)
297
298	sys.exit(0)