1 |
mcinquil |
1.5 |
from PostMortem import PostMortem
|
2 |
|
|
|
3 |
spiga |
1.1 |
from crab_util import *
|
4 |
|
|
import common
|
5 |
mcinquil |
1.5 |
import string, os
|
6 |
|
|
|
7 |
|
|
from ProdCommon.Storage.SEAPI.SElement import SElement
|
8 |
|
|
from ProdCommon.Storage.SEAPI.SBinterface import SBinterface
|
9 |
|
|
|
10 |
spiga |
1.6 |
class PostMortemServer(PostMortem):
|
11 |
mcinquil |
1.5 |
def __init__(self, cfg_params, nj_list):
|
12 |
|
|
|
13 |
|
|
PostMortem.__init__(self, cfg_params, nj_list)
|
14 |
|
|
|
15 |
|
|
# init client server params...
|
16 |
|
|
CliServerParams(self)
|
17 |
|
|
|
18 |
mcinquil |
1.12 |
self.copyTout= setLcgTimeout()
|
19 |
|
|
if common.scheduler.name().upper() in ['LSF', 'CAF']:
|
20 |
|
|
self.copyTout= ' '
|
21 |
|
|
|
22 |
mcinquil |
1.5 |
if self.storage_path[0]!='/':
|
23 |
|
|
self.storage_path = '/'+self.storage_path
|
24 |
|
|
|
25 |
spiga |
1.1 |
return
|
26 |
|
|
|
27 |
mcinquil |
1.5 |
def collectLogging(self):
|
28 |
spiga |
1.6 |
# get updated status from server
|
29 |
|
|
try:
|
30 |
|
|
from StatusServer import StatusServer
|
31 |
|
|
stat = StatusServer(self.cfg_params)
|
32 |
mcinquil |
1.13 |
warning_msg = stat.resynchClientSide()
|
33 |
|
|
if warning_msg is not None:
|
34 |
|
|
common.logger.info(warning_msg)
|
35 |
spiga |
1.6 |
except:
|
36 |
|
|
pass
|
37 |
mcinquil |
1.5 |
|
38 |
|
|
#create once storage interaction object
|
39 |
|
|
seEl = None
|
40 |
|
|
loc = None
|
41 |
|
|
try:
|
42 |
|
|
seEl = SElement(self.storage_name, self.storage_proto, self.storage_port)
|
43 |
|
|
except Exception, ex:
|
44 |
spiga |
1.8 |
common.logger.debug( str(ex))
|
45 |
mcinquil |
1.5 |
msg = "ERROR: Unable to create SE source interface \n"
|
46 |
|
|
raise CrabException(msg)
|
47 |
|
|
try:
|
48 |
|
|
loc = SElement("localhost", "local")
|
49 |
|
|
except Exception, ex:
|
50 |
spiga |
1.8 |
common.logger.debug( str(ex))
|
51 |
mcinquil |
1.5 |
msg = "ERROR: Unable to create destination interface \n"
|
52 |
|
|
raise CrabException(msg)
|
53 |
spiga |
1.1 |
|
54 |
mcinquil |
1.5 |
## coupling se interfaces
|
55 |
|
|
sbi = SBinterface( seEl, loc )
|
56 |
spiga |
1.1 |
|
57 |
mcinquil |
1.5 |
## get the list of jobs to get logging.info skimmed by failed status
|
58 |
|
|
logginable = self.skimDeadList()
|
59 |
|
|
|
60 |
mcinquil |
1.12 |
if self.storage_proto in ['globus']:
|
61 |
|
|
for id in self.nj_list:
|
62 |
|
|
if id not in self.all_jobs:
|
63 |
|
|
common.logger.info('Warning: job # ' + str(id) + ' does not exist! Not possible to ask for postMortem ')
|
64 |
|
|
elif id not in logginable:
|
65 |
|
|
common.logger.info('Warning: job # ' + str(id) + ' not killed or aborted! Will get loggingInfo manually ')
|
66 |
|
|
PostMortem.collectOneLogging(self,id)
|
67 |
|
|
# construct a list of absolute paths of input files
|
68 |
|
|
# and the destinations to copy them to
|
69 |
|
|
sourcesList = []
|
70 |
|
|
destsList = []
|
71 |
|
|
self.taskuuid = str(common._db.queryTask('name'))
|
72 |
|
|
common.logger.debug( "Starting globus retrieval for task name: " + self.taskuuid)
|
73 |
|
|
remotedir = os.path.join(self.storage_path, self.taskuuid)
|
74 |
|
|
for i in logginable:
|
75 |
|
|
remotelog = remotedir + '/loggingInfo_'+str(i)+'.log'
|
76 |
|
|
sourcesList.append(remotelog)
|
77 |
|
|
fname = self.fname_base + str(i) + '.LoggingInfo'
|
78 |
|
|
destsList.append(fname)
|
79 |
|
|
|
80 |
|
|
# try to do the copy
|
81 |
|
|
copy_res = None
|
82 |
|
|
try:
|
83 |
|
|
copy_res = sbi.copy( sourcesList, destsList, opt=self.copyTout)
|
84 |
|
|
except Exception, ex:
|
85 |
|
|
msg = "WARNING: Unable to retrieve logging info file %s \n" % osbFiles[i]
|
86 |
|
|
msg += str(ex)
|
87 |
|
|
common.logger.debug(msg)
|
88 |
|
|
import traceback
|
89 |
|
|
common.logger.debug( str(traceback.format_exc()) )
|
90 |
|
|
if copy_res is not None:
|
91 |
|
|
## evaluating copy results
|
92 |
|
|
copy_err_list = []
|
93 |
|
|
count = 0
|
94 |
|
|
for ll in map(None, copy_res, sourcesList):
|
95 |
|
|
exitcode = int(ll[0][0])
|
96 |
|
|
if exitcode == 0:
|
97 |
|
|
## decode logging info
|
98 |
|
|
fl = open(destsList[count], 'r')
|
99 |
|
|
out = "".join(fl.readlines())
|
100 |
|
|
fl.close()
|
101 |
|
|
reason = self.decodeLogging(out)
|
102 |
|
|
common.logger.info('Logging info for job '+ str(logginable[count]) +': '+str(reason)+'\n written to '+str(destsList[count])+' \n' )
|
103 |
|
|
else:
|
104 |
|
|
common.logger.info('Logging info for job '+ str(logginable[count]) +' not retrieved. Tring to get loggingInfo manually')
|
105 |
|
|
PostMortem.collectOneLogging(self,logginable[count])
|
106 |
|
|
count += 1
|
107 |
|
|
else:
|
108 |
|
|
## iter over each asked job and print warning if not in skimmed list
|
109 |
|
|
for id in self.nj_list:
|
110 |
|
|
if id not in self.all_jobs:
|
111 |
|
|
common.logger.info('Warning: job # ' + str(id) + ' does not exist! Not possible to ask for postMortem ')
|
112 |
mcinquil |
1.5 |
continue
|
113 |
mcinquil |
1.12 |
elif id in logginable:
|
114 |
|
|
fname = self.fname_base + str(id) + '.LoggingInfo'
|
115 |
|
|
if os.path.exists(fname):
|
116 |
|
|
common.logger.info('Logging info for job ' + str(id) + ' already present in '+fname+'\nRemove it for update')
|
117 |
|
|
continue
|
118 |
|
|
## retrieving & processing logging info
|
119 |
|
|
if self.retrieveFile( sbi, id, fname):
|
120 |
|
|
## decode logging info
|
121 |
|
|
fl = open(fname, 'r')
|
122 |
|
|
out = "".join(fl.readlines())
|
123 |
|
|
fl.close()
|
124 |
|
|
reason = self.decodeLogging(out)
|
125 |
|
|
common.logger.info('Logging info for job '+ str(id) +': '+str(reason)+'\n written to '+str(fname)+' \n' )
|
126 |
|
|
else:
|
127 |
|
|
common.logger.info('Logging info for job '+ str(id) +' not retrieved. Tring to get loggingInfo manually')
|
128 |
|
|
PostMortem.collectOneLogging(self,id)
|
129 |
mcinquil |
1.5 |
else:
|
130 |
mcinquil |
1.12 |
common.logger.info('Warning: job # ' + str(id) + ' not killed or aborted! Will get loggingInfo manually ')
|
131 |
slacapra |
1.10 |
PostMortem.collectOneLogging(self,id)
|
132 |
mcinquil |
1.5 |
return
|
133 |
spiga |
1.1 |
|
134 |
|
|
|
135 |
mcinquil |
1.5 |
def skimDeadList(self):
|
136 |
|
|
"""
|
137 |
|
|
__skimDeadList__
|
138 |
|
|
return the list of jobs really failed: K, A
|
139 |
|
|
"""
|
140 |
|
|
skimmedlist = []
|
141 |
|
|
self.up_task = common._db.getTask( self.nj_list )
|
142 |
|
|
for job in self.up_task.jobs:
|
143 |
|
|
if job.runningJob['status'] in ['K','A']:
|
144 |
|
|
skimmedlist.append(job['jobId'])
|
145 |
|
|
return skimmedlist
|
146 |
|
|
|
147 |
|
|
def retrieveFile(self, sbi, jobid, destlog):
|
148 |
|
|
"""
|
149 |
|
|
__retrieveFile__
|
150 |
spiga |
1.1 |
|
151 |
mcinquil |
1.5 |
retrieves logging.info file from the server storage area
|
152 |
|
|
"""
|
153 |
|
|
self.taskuuid = str(common._db.queryTask('name'))
|
154 |
spiga |
1.8 |
common.logger.debug( "Task name: " + self.taskuuid)
|
155 |
spiga |
1.1 |
|
156 |
mcinquil |
1.5 |
# full remote dir
|
157 |
|
|
remotedir = os.path.join(self.storage_path, self.taskuuid)
|
158 |
|
|
remotelog = remotedir + '/loggingInfo_'+str(jobid)+'.log'
|
159 |
|
|
|
160 |
spiga |
1.8 |
common.logger.info("Starting retrieving logging-info from server " \
|
161 |
mcinquil |
1.5 |
+ str(self.storage_name) + " for job " \
|
162 |
|
|
+ str(jobid) + "...")
|
163 |
spiga |
1.1 |
|
164 |
mcinquil |
1.5 |
# retrieve logging info from storage
|
165 |
spiga |
1.8 |
common.logger.debug( "retrieving "+ str(remotelog) +" to "+ str(destlog) )
|
166 |
mcinquil |
1.5 |
try:
|
167 |
|
|
sbi.copy( remotelog, destlog)
|
168 |
|
|
except Exception, ex:
|
169 |
|
|
msg = "WARNING: Unable to retrieve logging-info file %s \n"%remotelog
|
170 |
|
|
msg += str(ex)
|
171 |
spiga |
1.8 |
common.logger.debug(msg)
|
172 |
mcinquil |
1.5 |
return False
|
173 |
|
|
# cleaning remote logging info file
|
174 |
|
|
try:
|
175 |
mcinquil |
1.11 |
common.logger.debug( "Cleaning remote file [%s] " %( str(remotelog) ) )
|
176 |
mcinquil |
1.5 |
sbi.delete(remotelog)
|
177 |
|
|
except Exception, ex:
|
178 |
|
|
msg = "WARNING: Unable to clean remote logging-info file %s \n"%remotelog
|
179 |
|
|
msg += str(ex)
|
180 |
spiga |
1.8 |
common.logger.debug(msg)
|
181 |
mcinquil |
1.5 |
return True
|