1 |
|
2 |
"""
|
3 |
* ApMon - Application Monitoring Tool
|
4 |
* Version: 2.2.1
|
5 |
*
|
6 |
* Copyright (C) 2006 California Institute of Technology
|
7 |
*
|
8 |
* Permission is hereby granted, free of charge, to use, copy and modify
|
9 |
* this software and its documentation (the "Software") for any
|
10 |
* purpose, provided that existing copyright notices are retained in
|
11 |
* all copies and that this notice is included verbatim in any distributions
|
12 |
* or substantial portions of the Software.
|
13 |
* This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
|
14 |
* Users of the Software are asked to feed back problems, benefits,
|
15 |
* and/or suggestions about the software to the MonALISA Development Team
|
16 |
* (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
|
17 |
* incorporation of new features - is done on a best effort basis. All bug
|
18 |
* fixes and enhancements will be made available under the same terms and
|
19 |
* conditions as the original software,
|
20 |
|
21 |
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
|
22 |
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
|
23 |
* OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
|
24 |
* EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25 |
|
26 |
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
27 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
28 |
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
|
29 |
* PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
|
30 |
* OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
31 |
* MODIFICATIONS.
|
32 |
"""
|
33 |
|
34 |
|
35 |
import os
|
36 |
import re
|
37 |
import time
|
38 |
import string
|
39 |
import socket
|
40 |
import Logger
|
41 |
|
42 |
"""
|
43 |
Class ProcInfo
|
44 |
extracts information from the proc/ filesystem for system and job monitoring
|
45 |
"""
|
46 |
class ProcInfo:
    """Collect system-wide and per-job monitoring values.

    System values are read from files under /proc and from the output of
    ``uname``, ``sysctl``, ``ps``, ``/sbin/ifconfig`` and ``netstat``; they
    are stored in ``DATA``.  Per-job values are read with ``ps``, ``du`` and
    ``df`` and stored in ``JOBS[pid]['DATA']``.

    The code is written to run unchanged on Python 2.6+ and Python 3.
    """

    def __init__(this, logger):
        """Create an empty collector.

        logger -- object whose ``log(level, message)`` method receives the
                  diagnostics emitted by this class.
        """
        this.DATA = {}              # monitored data that is going to be reported
        this.LAST_UPDATE_TIME = 0   # when the last measurement was done
        this.JOBS = {}              # jobs that will be monitored
        this.logger = logger        # use the given logger
        uname = os.popen('uname -s')
        try:
            this.OS_TYPE = uname.readline().replace('\n', '')
        finally:
            uname.close()

    def update(this):
        """Refresh all system values and the values of every monitored job.

        Should be called from time to time, but not more often than once a
        second because of the resolution of time(); a second call within the
        same second only logs a notice and returns.
        """
        if this.LAST_UPDATE_TIME == int(time.time()):
            this.logger.log(Logger.NOTICE, "ProcInfo: update() called too often!")
            return
        this.readStat()
        this.readMemInfo()
        if this.OS_TYPE == 'Darwin':
            this.darwin_readLoadAvg()
        else:
            this.readLoadAvg()
        this.countProcesses()
        this.readGenericInfo()
        this.readNetworkInfo()
        this.readNetStat()
        # Iterate over a copy: readJobInfo() removes jobs that have terminated.
        for pid in list(this.JOBS.keys()):
            this.readJobInfo(pid)
            this.readJobDiskUsage(pid)
        this.LAST_UPDATE_TIME = int(time.time())
        this.DATA['TIME'] = int(time.time())

    def addJobToMonitor(this, pid, workDir):
        """Start monitoring ``pid``; ``workDir`` is the directory measured by
        readJobDiskUsage() (an empty string disables the disk measurements)."""
        this.JOBS[pid] = {'WORKDIR': workDir, 'DATA': {}}

    def removeJobToMonitor(this, pid):
        """Stop monitoring ``pid``; unknown pids are ignored."""
        this.JOBS.pop(pid, None)

    def getSystemData(this, params, prevDataRef=None):
        """Return the requested system parameters as a sorted list of
        (name, value) pairs.

        params      -- list of parameter names (see getFilteredData()).
        prevDataRef -- dict owned by the caller that holds the previous raw
                       counters.  When given, the derived cpu/paging/network
                       values are recomputed first and the dict is updated;
                       when omitted, the stored values are only filtered.
        """
        return this.getFilteredData(this.DATA, params, prevDataRef)

    def getJobData(this, pid, params):
        """Return the requested parameters of job ``pid`` as a sorted list of
        (name, value) pairs, or [] when the pid is not monitored."""
        if pid not in this.JOBS:
            return []
        return this.getFilteredData(this.JOBS[pid]['DATA'], params)

    ############################################################################################
    # internal functions for system monitoring
    ############################################################################################

    def readStat(this):
        """Store the raw cpu, paging and swap counters found in /proc/stat.

        Only raw counters are stored here; computeCummulativeParams() turns
        two successive readings into percentages and rates.  The ``page`` and
        ``swap`` lines are not present on 2.6 kernels, in which case the
        corresponding raw_* values are simply not set.
        """
        try:
            with open('/proc/stat') as fstat:
                for line in fstat:
                    fields = line.split()
                    if line.startswith("cpu ") and len(fields) >= 5:
                        this.DATA['raw_cpu_usr'] = float(fields[1])
                        this.DATA['raw_cpu_nice'] = float(fields[2])
                        this.DATA['raw_cpu_sys'] = float(fields[3])
                        this.DATA['raw_cpu_idle'] = float(fields[4])
                    elif line.startswith("page") and len(fields) >= 3:
                        this.DATA['raw_pages_in'] = float(fields[1])
                        this.DATA['raw_pages_out'] = float(fields[2])
                    elif line.startswith('swap') and len(fields) >= 3:
                        this.DATA['raw_swap_in'] = float(fields[1])
                        this.DATA['raw_swap_out'] = float(fields[2])
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/stat")

    def readMemInfo(this):
        """Store memory and swap figures read from /proc/meminfo.

        The values read from the file are divided by 1024 before being
        stored (the file comments describe the results as MB); the *_usage
        values are percentages.
        """
        names = {
            'MemFree:': 'mem_free',
            'MemTotal:': 'total_mem',
            'SwapFree:': 'swap_free',
            'SwapTotal:': 'total_swap',
        }
        data = this.DATA
        try:
            with open('/proc/meminfo') as fmem:
                for line in fmem:
                    fields = line.split()
                    if len(fields) >= 2 and fields[0] in names:
                        data[names[fields[0]]] = float(fields[1]) / 1024.0
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo")
            return
        if 'total_mem' in data and 'mem_free' in data:
            data['mem_used'] = data['total_mem'] - data['mem_free']
        if 'total_swap' in data and 'swap_free' in data:
            data['swap_used'] = data['total_swap'] - data['swap_free']
        if 'mem_used' in data and 'total_mem' in data and data['total_mem'] > 0:
            data['mem_usage'] = 100.0 * data['mem_used'] / data['total_mem']
        if 'swap_used' in data and 'total_swap' in data and data['total_swap'] > 0:
            data['swap_usage'] = 100.0 * data['swap_used'] / data['total_swap']

    def readLoadAvg(this):
        """Store the first three fields of /proc/loadavg as load1/load5/load15."""
        try:
            with open('/proc/loadavg') as favg:
                fields = favg.readline().split()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/loadavg")
            return
        if len(fields) >= 3:
            this.DATA['load1'] = float(fields[0])
            this.DATA['load5'] = float(fields[1])
            this.DATA['load15'] = float(fields[2])

    def darwin_readLoadAvg(this):
        """Store load1/load5/load15 from the output of ``sysctl vm.loadavg``.

        Accepts both ``vm.loadavg: a b c`` and ``vm.loadavg: { a b c }``;
        a decimal comma is accepted as well.
        """
        try:
            load_avg = os.popen('sysctl vm.loadavg')
            try:
                line = load_avg.readline()
            finally:
                load_avg.close()
            # Keep only what follows the "name:" prefix and drop the braces.
            text = line.split(':', 1)[-1].replace('{', ' ').replace('}', ' ')
            values = [float(item.replace(',', '.')) for item in text.split()[:3]]
        except (IOError, ValueError):
            values = []
        if len(values) < 3:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run 'sysctl vm.loadavg'")
            return
        this.DATA['load1'], this.DATA['load5'], this.DATA['load15'] = values

    def countProcesses(this):
        """Count the processes listed by ``ps -A -o state``.

        Stores the total in DATA['processes'] and one DATA['processes_<X>']
        counter per state letter (first character of the state column).  The
        D, R, S, T and Z counters are always present; other letters are added
        when ps reports them.
        """
        total = 0
        states = {'D': 0, 'R': 0, 'S': 0, 'T': 0, 'Z': 0}
        try:
            output = os.popen('ps -A -o state')
            try:
                output.readline()  # skip the column header, it is not a process
                for line in output:
                    code = line.strip()[:1]
                    if not code:
                        continue
                    states[code] = states.get(code, 0) + 1
                    total += 1
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from ps command")
            return
        # Drop per-state counters from an earlier run that no longer apply.
        for stale in [key for key in this.DATA if key.startswith('processes_')]:
            del this.DATA[stale]
        this.DATA['processes'] = total
        for code, count in states.items():
            this.DATA['processes_' + code] = count

    def readGenericInfo(this):
        """Store the hostname, the IPs of the eth interfaces, the cpu
        description and the uptime.

        Each source is read independently, so a failure in one of them does
        not prevent the others from being collected.
        """
        this.DATA['hostname'] = socket.getfqdn()
        this._readInterfaceAddresses()
        this._readCpuInfo()
        this._readUptime()

    def _readInterfaceAddresses(this):
        """Store DATA['ethN_ip'] for each eth interface printed by ifconfig."""
        try:
            output = os.popen('/sbin/ifconfig -a')
            try:
                eth = ''
                for raw in output:
                    line = raw.strip()
                    if raw[:1].strip():
                        # An unindented line starts the stanza of a new
                        # interface; remember its name only if it is an ethN.
                        eth = ''
                        if line.startswith("eth"):
                            eth = line.split()[0].rstrip(':')
                    elif eth and line.startswith("inet "):
                        # Matches both "inet addr:1.2.3.4" and "inet 1.2.3.4".
                        m = re.match(r"inet (?:addr:)?(\d+\.\d+\.\d+\.\d+)", line)
                        if m is not None:
                            this.DATA[eth + '_ip'] = m.group(1)
                            eth = ''  # only the first address is reported
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from /sbin/ifconfig -a")

    def _readCpuInfo(this):
        """Store the cpu description fields found in /proc/cpuinfo.

        DATA['no_CPUs'] is the number of "cpu MHz" lines in the file.
        """
        text_fields = {
            'vendor_id': 'cpu_vendor_id',
            'cpu family': 'cpu_family',
            'model': 'cpu_model',
            'model name': 'cpu_model_name',
        }
        number_fields = {'cpu MHz': 'cpu_MHz', 'bogomips': 'bogomips'}
        no_cpus = 0
        try:
            with open('/proc/cpuinfo') as fcpu:
                for line in fcpu:
                    name, sep, value = line.partition(':')
                    if not sep:
                        continue
                    name = name.strip()
                    value = value.strip()
                    if name == 'cpu MHz':
                        no_cpus += 1
                    if name in number_fields:
                        number = re.match(r"\d+\.?\d*", value)
                        if number is not None:
                            this.DATA[number_fields[name]] = float(number.group(0))
                    elif name in text_fields and value:
                        this.DATA[text_fields[name]] = value
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/cpuinfo")
            return
        this.DATA['no_CPUs'] = no_cpus

    def _readUptime(this):
        """Store DATA['uptime'] in days (first field of /proc/uptime / 86400)."""
        try:
            with open('/proc/uptime') as fupt:
                fields = fupt.readline().split()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime")
            return
        if fields:
            this.DATA['uptime'] = float(fields[0]) / (24.0 * 3600)

    def diffWithOverflowCheck(this, new, old):
        """Return ``new - old`` for a counter that may have wrapped around.

        When ``new`` is smaller than ``old`` the counter is assumed to have
        overflowed: 2**32 is added, or 2**64 when ``old`` does not fit in an
        unsigned 32 bit value.
        """
        if new >= old:
            return new - old
        wrap = 1 << 32          # 32 bit counter
        if old >= wrap:
            wrap = 1 << 64      # 64 bit counter
        return new - old + wrap

    def readNetworkInfo(this):
        """Store the raw byte and error counters of the eth interfaces listed
        in /proc/net/dev.

        For every ethN line the 1st and 9th numeric columns are stored as
        raw_ethN_in / raw_ethN_out and the sum of the 3rd and 11th columns as
        raw_ethN_errs.  A space between "ethN:" and the first counter is
        accepted.
        """
        try:
            with open('/proc/net/dev') as fnet:
                for line in fnet:
                    m = re.match(r"\s*eth(\d+):\s*(.*)", line)
                    if m is None:
                        continue
                    counters = m.group(2).split()
                    if len(counters) < 11:
                        continue
                    try:
                        bytes_in = float(counters[0])
                        bytes_out = float(counters[8])
                        errors = int(counters[2]) + int(counters[10])
                    except ValueError:
                        continue
                    prefix = 'raw_eth' + m.group(1)
                    this.DATA[prefix + '_in'] = bytes_in
                    this.DATA[prefix + '_out'] = bytes_out
                    this.DATA[prefix + '_errs'] = errors
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/net/dev")

    def readNetStat(this):
        """Count sockets per protocol, and tcp sockets per state, from the
        output of ``netstat -an``.

        A line is attributed to a protocol by the prefix of its first column;
        for tcp lines the last column is taken as the connection state and
        counted only if it is one of the states listed in ``tcp_details``.
        """
        sockets = {'sockets_tcp': 0, 'sockets_udp': 0, 'sockets_unix': 0, 'sockets_icm': 0}
        tcp_details = {
            'sockets_tcp_ESTABLISHED': 0, 'sockets_tcp_SYN_SENT': 0,
            'sockets_tcp_SYN_RECV': 0, 'sockets_tcp_FIN_WAIT1': 0, 'sockets_tcp_FIN_WAIT2': 0,
            'sockets_tcp_TIME_WAIT': 0, 'sockets_tcp_CLOSED': 0, 'sockets_tcp_CLOSE_WAIT': 0,
            'sockets_tcp_LAST_ACK': 0, 'sockets_tcp_LISTEN': 0, 'sockets_tcp_CLOSING': 0,
            'sockets_tcp_UNKNOWN': 0,
        }
        try:
            output = os.popen('netstat -an 2>/dev/null')
            try:
                for line in output:
                    columns = line.split()
                    if not columns:
                        continue  # blank separator line
                    proto = columns[0]
                    if proto.startswith('tcp'):
                        sockets['sockets_tcp'] += 1
                        key = 'sockets_tcp_' + columns[-1]
                        if key in tcp_details:
                            tcp_details[key] += 1
                    elif proto.startswith('udp'):
                        sockets['sockets_udp'] += 1
                    elif proto.startswith('unix'):
                        sockets['sockets_unix'] += 1
                    elif proto.startswith('icm'):
                        sockets['sockets_icm'] += 1
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from netstat command")
            return
        this.DATA.update(sockets)
        this.DATA.update(tcp_details)

    ##############################################################################################
    # job monitoring related functions
    ##############################################################################################

    def getChildren(this, parent):
        """Return the pids of ``parent`` and all of its descendants.

        The process table is read with ``ps -A -o "pid ppid"``.  The result
        starts with the parent itself.  When the parent is not in the process
        table the job is removed from the monitored jobs and [] is returned.
        When the process table could not be read at all, [parent] is returned
        so that a failing ps does not drop a job that may still be running.
        """
        try:
            root = int(parent)
        except (TypeError, ValueError):
            root = None
        pidmap = {}
        try:
            output = os.popen('ps -A -o "pid ppid"')
            try:
                output.readline()  # skip headers
                for line in output:
                    elem = line.split()
                    if len(elem) < 2:
                        continue
                    try:
                        # Store numbers so that they compare equal to int pids.
                        pidmap[int(elem[0])] = int(elem[1])
                    except ValueError:
                        continue
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps -A -o \"pid ppid\"")

        if not pidmap:
            return [parent]
        if root not in pidmap:
            this.logger.log(Logger.INFO, 'ProcInfo: No job with pid=' + str(parent))
            this.removeJobToMonitor(parent)
            return []

        # Index the table by parent so each process is visited only once.
        kids = {}
        for pid, ppid in pidmap.items():
            kids.setdefault(ppid, []).append(pid)
        children = [root]
        seen = set(children)
        i = 0
        while i < len(children):
            for pid in kids.get(children[i], []):
                if pid not in seen:  # guards against a pid listed as its own ancestor
                    seen.add(pid)
                    children.append(pid)
            i += 1
        return children

    def parsePSTime(this, my_time):
        """Convert a ps time such as "days-hours:min:sec", "hours:min:sec" or
        "min:sec" to seconds; return 0 when the text has none of these forms."""
        my_time = my_time.strip()
        m = re.match(r"(\d+)-(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            days, hours, minutes, seconds = [int(part) for part in m.groups()]
            return days * 24 * 3600 + hours * 3600 + minutes * 60 + seconds
        m = re.match(r"(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            hours, minutes, seconds = [int(part) for part in m.groups()]
            return hours * 3600 + minutes * 60 + seconds
        m = re.match(r"(\d+):(\d+)", my_time)
        if m is not None:
            return int(m.group(1)) * 60 + int(m.group(2))
        return 0

    def readJobInfo(this, pid):
        """Collect run time, cpu, memory and open-file figures for job ``pid``
        and all of its descendants into JOBS[pid]['DATA'].

        run_time is the largest elapsed time among the processes; cpu_time
        and cpu_usage are sums over all processes.  Memory figures (rss and
        virtualmem as printed by ps, in KB according to the file comments)
        and open files are added only once for processes that report the
        same %mem, rsz, vsz and command, so that threads sharing a memory
        footprint are not counted repeatedly.  A job without any live
        process is removed from the monitored jobs.
        """
        if pid == '' or pid not in this.JOBS:
            return
        children = this.getChildren(pid)
        if len(children) == 0:
            this.logger.log(Logger.INFO, "ProcInfo: Job with pid=" + str(pid) + " terminated; removing it from monitored jobs.")
            this.removeJobToMonitor(pid)
            return
        command = ("ps --no-headers --pid " + ",".join([str(child) for child in children]) +
                   " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm")
        etime, cputime, pcpu, pmem, rsz, vsz, fd = 0, 0, 0, 0, 0, 0, 0
        footprints = set()
        try:
            jstatus = os.popen(command)
            try:
                for line in jstatus:
                    m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line.strip())
                    if m is None:
                        continue
                    apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.groups()
                    etime = max(etime, this.parsePSTime(etime1))
                    cputime += this.parsePSTime(cputime1)
                    pcpu += float(pcpu1)
                    footprint = (pmem1, rsz1, vsz1, comm1)
                    if footprint not in footprints:
                        # first thread/process with this memory footprint; add it.
                        footprints.add(footprint)
                        pmem += float(pmem1)
                        rsz += int(rsz1)
                        vsz += int(vsz1)
                        fd += this.countOpenFD(apid)
            finally:
                jstatus.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute " + command)
            return
        job_data = this.JOBS[pid]['DATA']
        job_data['run_time'] = etime
        job_data['cpu_time'] = cputime
        job_data['cpu_usage'] = pcpu
        job_data['mem_usage'] = pmem
        job_data['rss'] = rsz
        job_data['virtualmem'] = vsz
        job_data['open_files'] = fd

    def countOpenFD(this, pid):
        """Return the number of entries in /proc/<pid>/fd.

        Returns 0 (after logging an error) when the directory does not exist
        or cannot be listed, so that callers can always add the result.
        ``pid`` may be an int or the pid text printed by ps.
        """
        fd_dir = '/proc/' + str(pid) + '/fd'
        if not os.access(fd_dir, os.F_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: job " + repr(pid) + " dosen't exist")
            return 0
        try:
            entries = os.listdir(fd_dir)
        except OSError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return 0
        open_files = len(entries)
        if str(pid) == str(os.getpid()):
            # NOTE(review): presumably discounts descriptors this process
            # holds open only while measuring - confirm the value 2.
            open_files -= 2
        this.logger.log(Logger.DEBUG, "Counting open_files for " + repr(pid) + ": " + str(len(entries)) + " => " + repr(open_files) + " open_files")
        return open_files

    def readJobDiskUsage(this, pid):
        """Measure the work directory of job ``pid``.

        Stores in JOBS[pid]['DATA'] the size of the work directory
        (workdir_size, from du) and, for the partition that holds it,
        disk_total / disk_used / disk_free (from df) divided by 1024 (MB
        according to the file comments) plus disk_usage, the percentage
        printed by df.  Nothing is done for unknown pids or when no work
        directory was given.
        """
        if pid == '' or pid not in this.JOBS:
            return
        workDir = this.JOBS[pid]['WORKDIR']
        if workDir == '':
            return
        job_data = this.JOBS[pid]['DATA']
        # NOTE(review): workDir is interpolated unquoted into shell commands;
        # it must come from a trusted source and must not need quoting.
        try:
            du = os.popen("du -Lsck " + workDir + " | tail -1 | cut -f 1")
            try:
                line = du.readline()
            finally:
                du.close()
            job_data['workdir_size'] = int(line) / 1024.0
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run du to get job's disk usage for job " + repr(pid))
        try:
            # -P keeps each file system on a single line even for long device names.
            df = os.popen("df -kP " + workDir + " | tail -1")
            try:
                line = df.readline().strip()
            finally:
                df.close()
            m = re.match(r"\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)%", line)
            if m is not None:
                job_data['disk_total'] = float(m.group(1)) / 1024.0
                job_data['disk_used'] = float(m.group(2)) / 1024.0
                job_data['disk_free'] = float(m.group(3)) / 1024.0
                job_data['disk_usage'] = float(m.group(4))  # already a percentage
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run df to get job's disk usage for job " + repr(pid))

    def computeCummulativeParams(this, dataRef, prevDataRef):
        """Derive percentages and rates from the raw_* counters.

        dataRef     -- current values; derived values are written into it.
        prevDataRef -- raw counters and 'TIME' of the previous call; updated
                       to the current ones before returning.

        On the first call (``prevDataRef`` has no 'TIME' yet) the current raw
        counters are only remembered.  Afterwards:
          * cpu_usr/cpu_nice/cpu_sys/cpu_idle/cpu_usage are percentages of
            the cpu counter increase since the previous call;
          * pages_in/pages_out/swap_in/swap_out are 1000 * increase / interval
            (NOTE(review): the reason for the factor 1000 is not visible
            in this file);
          * ethN_in/ethN_out are increase / interval / 1024 and ethN_errs is
            the plain increase.
        A derived value that cannot be computed (no cpu activity, zero
        interval) is removed from ``dataRef`` instead.
        """
        if 'TIME' not in dataRef:
            return  # nothing has been measured yet
        if 'TIME' not in prevDataRef:
            for key in dataRef:
                if key.startswith('raw_'):
                    prevDataRef[key] = dataRef[key]
            prevDataRef['TIME'] = dataRef['TIME']
            return

        interval = dataRef['TIME'] - prevDataRef['TIME']

        # cpu -related params
        cpu_params = ['cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle']
        have_cpu = True
        for param in cpu_params:
            if 'raw_' + param not in dataRef or 'raw_' + param not in prevDataRef:
                have_cpu = False
        if have_cpu:
            increase = {}
            cpu_sum = 0
            for param in cpu_params:
                increase[param] = this.diffWithOverflowCheck(dataRef['raw_' + param], prevDataRef['raw_' + param])
                cpu_sum += increase[param]
            if cpu_sum != 0:
                for param in cpu_params:
                    dataRef[param] = 100.0 * increase[param] / cpu_sum
                dataRef['cpu_usage'] = 100.0 * (cpu_sum - increase['cpu_idle']) / cpu_sum
            else:
                for param in cpu_params + ['cpu_usage']:
                    dataRef.pop(param, None)

        # swap & pages -related params
        for param in ['pages_in', 'pages_out', 'swap_in', 'swap_out']:
            raw = 'raw_' + param
            if raw not in dataRef or raw not in prevDataRef:
                continue
            if interval != 0:
                dataRef[param] = 1000.0 * this.diffWithOverflowCheck(dataRef[raw], prevDataRef[raw]) / interval
            else:
                dataRef.pop(param, None)

        # eth - related params (iterate over a copy: derived keys are added)
        for rawParam in list(dataRef.keys()):
            if not rawParam.startswith('raw_eth') or rawParam not in prevDataRef:
                continue
            param = rawParam[len('raw_'):]
            if interval != 0:
                value = this.diffWithOverflowCheck(dataRef[rawParam], prevDataRef[rawParam])
                if param.find('_errs') == -1:
                    value = value / interval / 1024.0  # _in or _out: KB/sec
                dataRef[param] = value
            else:
                dataRef.pop(param, None)

        # remember the current raw values for the next call
        for param in dataRef:
            if param.startswith('raw_'):
                prevDataRef[param] = dataRef[param]
        prevDataRef['TIME'] = dataRef['TIME']

    def getFilteredData(this, dataHash, paramsList, prevDataHash=None):
        """Return the requested parameters that exist in ``dataHash`` as a
        list of (name, value) pairs sorted by name.

        Special parameter names:
          'net_sockets'     -- every 'sockets*' key except 'sockets_tcp_*';
          'net_tcp_details' -- every 'sockets_tcp_*' key;
          'net_<x>' / 'ip'  -- every key starting with 'eth<N>_<x>' / 'eth<N>_ip';
          'processes'       -- every key starting with 'processes'.
        Any other name is returned only if it is a key of ``dataHash``.

        When ``prevDataHash`` is given, computeCummulativeParams() is run
        first to refresh the derived values.
        """
        if prevDataHash is not None:
            this.computeCummulativeParams(dataHash, prevDataHash)

        result = {}
        for param in paramsList:
            if param == 'net_sockets':
                for key in dataHash:
                    if key.startswith('sockets') and not key.startswith('sockets_tcp_'):
                        result[key] = dataHash[key]
            elif param == 'net_tcp_details':
                for key in dataHash:
                    if key.startswith('sockets_tcp_'):
                        result[key] = dataHash[key]

            m = re.match(r"^net_(.*)$", param)
            if m is None:
                m = re.match(r"^(ip)$", param)
            if m is not None:
                net_param = m.group(1)
                for key, value in dataHash.items():
                    if re.match(r"eth\d+_" + net_param, key) is not None:
                        result[key] = value
            elif param == 'processes':
                for key in dataHash:
                    if key.startswith('processes'):
                        result[key] = dataHash[key]
            elif param in dataHash:
                result[param] = dataHash[param]
        return [(key, result[key]) for key in sorted(result.keys())]
|
587 |
|
588 |
######################################################################################
|
589 |
# self test
|
590 |
|
591 |
if __name__ == '__main__':
    # Self test: print the system values, then monitor this very process as a job.
    logger = Logger.Logger(Logger.DEBUG)
    pi = ProcInfo(logger)

    # Raw counters of the previous measurement; getSystemData() needs them to
    # turn the cpu and network counters into percentages and rates.
    prev_data = {}

    print("first update")
    pi.update()
    pi.getSystemData([], prev_data)  # remember the first reading as baseline
    print("Sleeping to accumulate")
    time.sleep(1)
    pi.update()
    # Compute the derived values exactly once for this measurement; doing it
    # again without a new update() would see a zero interval and drop them.
    pi.getSystemData([], prev_data)

    print("System Monitoring:")
    system_groups = [
        ('sys_cpu_params', ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']),
        ('sys_2_4_params', ['pages_in', 'pages_out', 'swap_in', 'swap_out']),
        ('sys_mem_params', ['mem_used', 'mem_free', 'total_mem', 'mem_usage']),
        ('sys_swap_params', ['swap_used', 'swap_free', 'total_swap', 'swap_usage']),
        ('sys_load_params', ['load1', 'load5', 'load15', 'processes', 'uptime']),
        ('sys_gen_params', ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family',
                            'cpu_model', 'cpu_model_name', 'bogomips']),
        ('sys_net_params', ['net_in', 'net_out', 'net_errs', 'ip']),
        ('sys_net_stat', ['sockets_tcp', 'sockets_udp', 'sockets_unix', 'sockets_icm']),
        ('sys_tcp_details', ['sockets_tcp_ESTABLISHED', 'sockets_tcp_SYN_SENT', 'sockets_tcp_SYN_RECV',
                             'sockets_tcp_FIN_WAIT1', 'sockets_tcp_FIN_WAIT2', 'sockets_tcp_TIME_WAIT',
                             'sockets_tcp_CLOSED', 'sockets_tcp_CLOSE_WAIT', 'sockets_tcp_LAST_ACK',
                             'sockets_tcp_LISTEN', 'sockets_tcp_CLOSING', 'sockets_tcp_UNKNOWN']),
    ]
    for label, params in system_groups:
        # Only filter the stored values here; they were derived above.
        print("%s %s" % (label, pi.getFilteredData(pi.DATA, params)))

    job_pid = os.getpid()

    print("Job (mysefl) monitoring:")
    pi.addJobToMonitor(job_pid, os.getcwd())
    print("Sleep another second")
    time.sleep(1)
    pi.update()

    job_groups = [
        ('job_cpu_params', ['run_time', 'cpu_time', 'cpu_usage']),
        ('job_mem_params', ['mem_usage', 'rss', 'virtualmem', 'open_files']),
        ('job_disk_params', ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']),
    ]
    for label, params in job_groups:
        print("%s %s" % (label, pi.getJobData(job_pid, params)))

    pi.removeJobToMonitor(os.getpid())
|