1 |
|
|
2 |
|
""" |
3 |
|
* ApMon - Application Monitoring Tool |
4 |
< |
* Version: 2.0.4 |
4 |
> |
* Version: 2.2.1 |
5 |
|
* |
6 |
< |
* Copyright (C) 2005 California Institute of Technology |
6 |
> |
* Copyright (C) 2006 California Institute of Technology |
7 |
|
* |
8 |
|
* Permission is hereby granted, free of charge, to use, copy and modify |
9 |
|
* this software and its documentation (the "Software") for any |
35 |
|
import os |
36 |
|
import re |
37 |
|
import time |
38 |
+ |
import string |
39 |
|
import socket |
40 |
|
import Logger |
41 |
|
|
47 |
|
# ProcInfo constructor |
48 |
|
def __init__ (this, logger): |
49 |
|
this.DATA = {}; # monitored data that is going to be reported |
49 |
– |
this.OLD_RAW = {}; # helper hashes from which is derived the |
50 |
– |
this.NEW_RAW = {}; # information in DATA for some of the measurements. |
50 |
|
this.LAST_UPDATE_TIME = 0; # when the last measurement was done |
51 |
|
this.JOBS = {}; # jobs that will be monitored |
52 |
|
this.logger = logger # use the given logger |
53 |
+ |
this.OS_TYPE = os.popen('uname -s').readline().replace('\n',''); |
54 |
|
|
55 |
|
# This should be called from time to time to update the monitored data, |
56 |
|
# but not more often than once a second because of the resolution of time() |
60 |
|
return; |
61 |
|
this.readStat(); |
62 |
|
this.readMemInfo(); |
63 |
< |
this.readLoadAvg(); |
63 |
> |
if this.OS_TYPE == 'Darwin': |
64 |
> |
this.darwin_readLoadAvg(); |
65 |
> |
else: |
66 |
> |
this.readLoadAvg(); |
67 |
|
this.countProcesses(); |
68 |
|
this.readGenericInfo(); |
69 |
|
this.readNetworkInfo(); |
70 |
+ |
this.readNetStat(); |
71 |
|
for pid in this.JOBS.keys(): |
72 |
|
this.readJobInfo(pid); |
73 |
|
this.readJobDiskUsage(pid); |
74 |
|
this.LAST_UPDATE_TIME = int(time.time()); |
75 |
< |
|
75 |
> |
this.DATA['TIME'] = int(time.time()); |
76 |
> |
|
77 |
|
# Call this to add another PID to be monitored |
78 |
|
def addJobToMonitor (this, pid, workDir): |
79 |
|
this.JOBS[pid] = {}; |
80 |
|
this.JOBS[pid]['WORKDIR'] = workDir; |
81 |
|
this.JOBS[pid]['DATA'] = {}; |
82 |
< |
print this.JOBS; |
82 |
> |
#print this.JOBS; |
83 |
|
|
84 |
|
# Call this to stop monitoring a PID |
85 |
|
def removeJobToMonitor (this, pid): |
87 |
|
del this.JOBS[pid]; |
88 |
|
|
89 |
|
# Return a filtered hash containting the system-related parameters and values |
90 |
< |
def getSystemData (this, params): |
91 |
< |
return this.getFilteredData(this.DATA, params); |
90 |
> |
def getSystemData (this, params, prevDataRef): |
91 |
> |
return this.getFilteredData(this.DATA, params, prevDataRef); |
92 |
|
|
93 |
|
# Return a filtered hash containing the job-related parameters and values |
94 |
|
def getJobData (this, pid, params): |
103 |
|
# this has to be run twice (with the $lastUpdateTime updated) to get some useful results |
104 |
|
# the information about pages_in/out and swap_in/out isn't available for 2.6 kernels (yet) |
105 |
|
def readStat (this): |
101 |
– |
this.OLD_RAW = this.NEW_RAW.copy(); |
106 |
|
try: |
107 |
|
FSTAT = open('/proc/stat'); |
108 |
|
line = FSTAT.readline(); |
109 |
|
while(line != ''): |
110 |
|
if(line.startswith("cpu ")): |
111 |
|
elem = re.split("\s+", line); |
112 |
< |
this.NEW_RAW['cpu_usr'] = float(elem[1]); |
113 |
< |
this.NEW_RAW['cpu_nice'] = float(elem[2]); |
114 |
< |
this.NEW_RAW['cpu_sys'] = float(elem[3]); |
115 |
< |
this.NEW_RAW['cpu_idle'] = float(elem[4]); |
112 |
> |
this.DATA['raw_cpu_usr'] = float(elem[1]); |
113 |
> |
this.DATA['raw_cpu_nice'] = float(elem[2]); |
114 |
> |
this.DATA['raw_cpu_sys'] = float(elem[3]); |
115 |
> |
this.DATA['raw_cpu_idle'] = float(elem[4]); |
116 |
|
if(line.startswith("page")): |
117 |
|
elem = line.split(); |
118 |
< |
this.NEW_RAW['pages_in'] = float(elem[1]); |
119 |
< |
this.NEW_RAW['pages_out'] = float(elem[2]); |
118 |
> |
this.DATA['raw_pages_in'] = float(elem[1]); |
119 |
> |
this.DATA['raw_pages_out'] = float(elem[2]); |
120 |
|
if(line.startswith('swap')): |
121 |
|
elem = line.split(); |
122 |
< |
this.NEW_RAW['swap_in'] = float(elem[1]); |
123 |
< |
this.NEW_RAW['swap_out'] = float(elem[2]); |
122 |
> |
this.DATA['raw_swap_in'] = float(elem[1]); |
123 |
> |
this.DATA['raw_swap_out'] = float(elem[2]); |
124 |
|
line = FSTAT.readline(); |
125 |
|
FSTAT.close(); |
126 |
|
except IOError, ex: |
127 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/stat"); |
128 |
|
return; |
129 |
< |
if(len(this.OLD_RAW.keys()) == 0): |
126 |
< |
return; |
127 |
< |
diff = {}; |
128 |
< |
cpu_sum = 0; |
129 |
< |
for key in (this.NEW_RAW.keys()): |
130 |
< |
#this.logger.log(Logger.DEBUG, "key = " + key); |
131 |
< |
if key.startswith('cpu_') or key.startswith('pages_') or key.startswith('swap_'): |
132 |
< |
#this.logger.log(Logger.DEBUG, "old = "+`this.OLD_RAW[key]`+" new = "+`this.NEW_RAW[key]`); |
133 |
< |
diff[key] = this.NEW_RAW[key] - this.OLD_RAW[key]; |
134 |
< |
if key.startswith('cpu_'): |
135 |
< |
#this.logger.log(Logger.DEBUG, "diff=" + `diff[key]`); |
136 |
< |
cpu_sum += diff[key]; |
137 |
< |
for key in ('cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle'): |
138 |
< |
this.DATA[key] = 100.0 * diff[key] / cpu_sum; |
139 |
< |
this.DATA['cpu_usage'] = 100.0 * (diff['cpu_usr'] + diff['cpu_sys'] + diff['cpu_nice']) / cpu_sum; |
140 |
< |
|
141 |
< |
if this.NEW_RAW.has_key('pages_in'): |
142 |
< |
now = int(time.time()); |
143 |
< |
for key in ('pages_in', 'pages_out', 'swap_in', 'swap_out'): |
144 |
< |
this.DATA[key] = diff[key] / (now - this.LAST_UPDATE_TIME); |
145 |
< |
|
129 |
> |
|
130 |
|
# sizes are reported in MB (except _usage that is in percent). |
131 |
|
def readMemInfo (this): |
132 |
|
try: |
144 |
|
this.DATA['total_swap'] = float(elem[1]) / 1024.0; |
145 |
|
line = FMEM.readline(); |
146 |
|
FMEM.close(); |
147 |
< |
this.DATA['mem_used'] = this.DATA['total_mem'] - this.DATA['mem_free']; |
148 |
< |
this.DATA['swap_used'] = this.DATA['total_swap'] - this.DATA['swap_free']; |
149 |
< |
this.DATA['mem_usage'] = 100.0 * this.DATA['mem_used'] / this.DATA['total_mem']; |
150 |
< |
this.DATA['swap_usage'] = 100.0 * this.DATA['swap_used'] / this.DATA['total_swap']; |
147 |
> |
if this.DATA.has_key('total_mem') and this.DATA.has_key('mem_free'): |
148 |
> |
this.DATA['mem_used'] = this.DATA['total_mem'] - this.DATA['mem_free']; |
149 |
> |
if this.DATA.has_key('total_swap') and this.DATA.has_key('swap_free'): |
150 |
> |
this.DATA['swap_used'] = this.DATA['total_swap'] - this.DATA['swap_free']; |
151 |
> |
if this.DATA.has_key('mem_used') and this.DATA.has_key('total_mem') and this.DATA['total_mem'] > 0: |
152 |
> |
this.DATA['mem_usage'] = 100.0 * this.DATA['mem_used'] / this.DATA['total_mem']; |
153 |
> |
if this.DATA.has_key('swap_used') and this.DATA.has_key('total_swap') and this.DATA['total_swap'] > 0: |
154 |
> |
this.DATA['swap_usage'] = 100.0 * this.DATA['swap_used'] / this.DATA['total_swap']; |
155 |
|
except IOError, ex: |
156 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo"); |
157 |
|
return; |
170 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo"); |
171 |
|
return; |
172 |
|
|
173 |
+ |
|
174 |
+ |
# read system load average on Darwin |
175 |
+ |
def darwin_readLoadAvg (this): |
176 |
+ |
try: |
177 |
+ |
LOAD_AVG = os.popen('sysctl vm.loadavg'); |
178 |
+ |
line = LOAD_AVG.readline(); |
179 |
+ |
LOAD_AVG.close(); |
180 |
+ |
elem = re.split("\s+", line); |
181 |
+ |
this.DATA['load1'] = float(elem[1]); |
182 |
+ |
this.DATA['load5'] = float(elem[2]); |
183 |
+ |
this.DATA['load15'] = float(elem[3]); |
184 |
+ |
except IOError, ex: |
185 |
+ |
this.logger.log(Logger.ERROR, "ProcInfo: cannot run 'sysctl vm.loadavg"); |
186 |
+ |
return; |
187 |
+ |
|
188 |
+ |
|
189 |
|
# read the number of processes currently running on the system |
190 |
|
def countProcesses (this): |
191 |
+ |
""" |
192 |
+ |
# old version |
193 |
|
nr = 0; |
194 |
|
try: |
195 |
|
for file in os.listdir("/proc"): |
199 |
|
except IOError, ex: |
200 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc to count processes"); |
201 |
|
return; |
202 |
< |
|
202 |
> |
""" |
203 |
> |
# new version |
204 |
> |
total = 0; |
205 |
> |
states = {'D':0, 'R':0, 'S':0, 'T':0, 'Z':0}; |
206 |
> |
try: |
207 |
> |
output = os.popen('ps -A -o state'); |
208 |
> |
line = output.readline(); |
209 |
> |
while(line != ''): |
210 |
> |
states[line[0]] = states[line[0]] + 1; |
211 |
> |
total = total + 1; |
212 |
> |
line = output.readline(); |
213 |
> |
output.close(); |
214 |
> |
this.DATA['processes'] = total; |
215 |
> |
for key in states.keys(): |
216 |
> |
this.DATA['processes_'+key] = states[key]; |
217 |
> |
except IOError, ex: |
218 |
> |
this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from ps command"); |
219 |
> |
return; |
220 |
> |
|
221 |
|
# reads the IP, hostname, cpu_MHz, uptime |
222 |
|
def readGenericInfo (this): |
223 |
|
this.DATA['hostname'] = socket.getfqdn(); |
248 |
|
if line.startswith("cpu MHz"): |
249 |
|
this.DATA['cpu_MHz'] = float(re.match("cpu MHz\s+:\s+(\d+\.?\d*)", line).group(1)); |
250 |
|
no_cpus += 1; |
251 |
+ |
|
252 |
+ |
if line.startswith("vendor_id"): |
253 |
+ |
this.DATA['cpu_vendor_id'] = re.match("vendor_id\s+:\s+(.+)", line).group(1); |
254 |
+ |
|
255 |
+ |
if line.startswith("cpu family"): |
256 |
+ |
this.DATA['cpu_family'] = re.match("cpu family\s+:\s+(.+)", line).group(1); |
257 |
+ |
|
258 |
+ |
if line.startswith("model") and not line.startswith("model name") : |
259 |
+ |
this.DATA['cpu_model'] = re.match("model\s+:\s+(.+)", line).group(1); |
260 |
+ |
|
261 |
+ |
if line.startswith("model name"): |
262 |
+ |
this.DATA['cpu_model_name'] = re.match("model name\s+:\s+(.+)", line).group(1); |
263 |
+ |
|
264 |
+ |
if line.startswith("bogomips"): |
265 |
+ |
this.DATA['bogomips'] = float(re.match("bogomips\s+:\s+(\d+\.?\d*)", line).group(1)); |
266 |
+ |
|
267 |
|
line = FCPU.readline(); |
268 |
|
FCPU.close(); |
269 |
|
this.DATA['no_CPUs'] = no_cpus; |
280 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime"); |
281 |
|
return; |
282 |
|
|
283 |
+ |
# do a difference with overflow check and repair |
284 |
+ |
# the counter is unsigned 32 or 64 bit |
285 |
+ |
def diffWithOverflowCheck(this, new, old): |
286 |
+ |
if new >= old: |
287 |
+ |
return new - old; |
288 |
+ |
else: |
289 |
+ |
max = (1L << 31) * 2; # 32 bits |
290 |
+ |
if old >= max: |
291 |
+ |
max = (1L << 63) * 2; # 64 bits |
292 |
+ |
return new - old + max; |
293 |
+ |
|
294 |
|
# read network information like transfered kBps and nr. of errors on each interface |
295 |
|
def readNetworkInfo (this): |
245 |
– |
this.OLD_RAW = this.NEW_RAW; |
246 |
– |
interval = int(time.time()) - this.LAST_UPDATE_TIME; |
296 |
|
try: |
297 |
|
FNET = open('/proc/net/dev'); |
298 |
|
line = FNET.readline(); |
299 |
|
while(line != ''): |
300 |
|
m = re.match("\s*eth(\d):(\d+)\s+\d+\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+\d+\s+(\d+)", line); |
301 |
|
if m != None: |
302 |
< |
this.NEW_RAW['eth'+m.group(1)+'_in'] = float(m.group(2)); |
303 |
< |
this.NEW_RAW['eth'+m.group(1)+'_out'] = float(m.group(4)); |
304 |
< |
this.DATA['eth'+m.group(1)+'_errs'] = int(m.group(3)) + int(m.group(5)); |
256 |
< |
if(this.OLD_RAW.has_key('eth'+m.group(1)+"_in")): |
257 |
< |
#this.logger.log(Logger.DEBUG, "Net I/O eth"+m.group(1)+" interval = "+ `interval`); |
258 |
< |
Bps = (this.NEW_RAW['eth'+m.group(1)+'_in'] - this.OLD_RAW['eth'+m.group(1)+'_in']) / interval; |
259 |
< |
this.DATA['eth'+m.group(1)+'_in'] = Bps / 1024.0; |
260 |
< |
Bps = (this.NEW_RAW['eth'+m.group(1)+'_out'] - this.OLD_RAW['eth'+m.group(1)+'_out']) / interval; |
261 |
< |
this.DATA['eth'+m.group(1)+'_out'] = Bps / 1024.0; |
302 |
> |
this.DATA['raw_eth'+m.group(1)+'_in'] = float(m.group(2)); |
303 |
> |
this.DATA['raw_eth'+m.group(1)+'_out'] = float(m.group(4)); |
304 |
> |
this.DATA['raw_eth'+m.group(1)+'_errs'] = int(m.group(3)) + int(m.group(5)); |
305 |
|
line = FNET.readline(); |
306 |
|
FNET.close(); |
307 |
|
except IOError, ex: |
308 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/net/dev"); |
309 |
|
return; |
310 |
< |
|
310 |
> |
|
311 |
> |
# run nestat and collect sockets info (tcp, udp, unix) and connection states for tcp sockets from netstat |
312 |
> |
def readNetStat(this): |
313 |
> |
try: |
314 |
> |
output = os.popen('netstat -an'); |
315 |
> |
sockets = { 'sockets_tcp':0, 'sockets_udp':0, 'sockets_unix':0, 'sockets_icm':0 }; |
316 |
> |
tcp_details = { 'sockets_tcp_ESTABLISHED':0, 'sockets_tcp_SYN_SENT':0, |
317 |
> |
'sockets_tcp_SYN_RECV':0, 'sockets_tcp_FIN_WAIT1':0, 'sockets_tcp_FIN_WAIT2':0, |
318 |
> |
'sockets_tcp_TIME_WAIT':0, 'sockets_tcp_CLOSED':0, 'sockets_tcp_CLOSE_WAIT':0, |
319 |
> |
'sockets_tcp_LAST_ACK':0, 'sockets_tcp_LISTEN':0, 'sockets_tcp_CLOSING':0, |
320 |
> |
'sockets_tcp_UNKNOWN':0 }; |
321 |
> |
line = output.readline(); |
322 |
> |
while(line != ''): |
323 |
> |
arg = string.split(line); |
324 |
> |
proto = arg[0]; |
325 |
> |
if proto.find('tcp') == 0: |
326 |
> |
sockets['sockets_tcp'] += 1; |
327 |
> |
state = arg[len(arg)-1]; |
328 |
> |
key = 'sockets_tcp_'+state; |
329 |
> |
if tcp_details.has_key(key): |
330 |
> |
tcp_details[key] += 1; |
331 |
> |
if proto.find('udp') == 0: |
332 |
> |
sockets['sockets_udp'] += 1; |
333 |
> |
if proto.find('unix') == 0: |
334 |
> |
sockets['sockets_unix'] += 1; |
335 |
> |
if proto.find('icm') == 0: |
336 |
> |
sockets['sockets_icm'] += 1; |
337 |
> |
|
338 |
> |
line = output.readline(); |
339 |
> |
output.close(); |
340 |
> |
|
341 |
> |
for key in sockets.keys(): |
342 |
> |
this.DATA[key] = sockets[key]; |
343 |
> |
for key in tcp_details.keys(): |
344 |
> |
this.DATA[key] = tcp_details[key]; |
345 |
> |
except IOError, ex: |
346 |
> |
this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from netstat command"); |
347 |
> |
return; |
348 |
> |
|
349 |
|
############################################################################################## |
350 |
|
# job monitoring related functions |
351 |
|
############################################################################################## |
354 |
|
def getChildren (this, parent): |
355 |
|
pidmap = {}; |
356 |
|
try: |
357 |
< |
output = os.popen('ps --no-headers -eo "pid ppid"'); |
357 |
> |
output = os.popen('ps -A -o "pid ppid"'); |
358 |
> |
line = output.readline(); # skip headers |
359 |
|
line = output.readline(); |
360 |
|
while(line != ''): |
361 |
|
line = line.strip(); |
364 |
|
line = output.readline(); |
365 |
|
output.close(); |
366 |
|
except IOError, ex: |
367 |
< |
this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps --no-headers -eo \"pid ppid\""); |
367 |
> |
this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps -A -o \"pid ppid\""); |
368 |
|
|
369 |
< |
if not pidmap.has_key(parent): |
369 |
> |
if pidmap.has_key(parent): |
370 |
|
this.logger.log(Logger.INFO, 'ProcInfo: No job with pid='+str(parent)); |
371 |
|
this.removeJobToMonitor(parent); |
372 |
|
return []; |
378 |
|
for (pid, ppid) in pidmap.items(): |
379 |
|
if ppid == prnt: |
380 |
|
children.append(pid); |
381 |
< |
i += 1; |
381 |
> |
i += 1; |
382 |
|
return children; |
383 |
|
|
384 |
|
# internal function that parses a time formatted like "days-hours:min:sec" and returns the corresponding |
385 |
|
# number of seconds. |
386 |
|
def parsePSTime (this, my_time): |
387 |
|
my_time = my_time.strip(); |
388 |
+ |
m = re.match("(\d+)-(\d+):(\d+):(\d+)", my_time); |
389 |
|
if m != None: |
307 |
– |
m = re.match("(\d+)-(\d+):(\d+):(\d+)", my_time); |
390 |
|
return int(m.group(1)) * 24 * 3600 + int(m.group(2)) * 3600 + int(m.group(3)) * 60 + int(m.group(4)); |
391 |
|
else: |
392 |
|
m = re.match("(\d+):(\d+):(\d+)", my_time); |
402 |
|
# read information about this the JOB_PID process |
403 |
|
# memory sizes are given in KB |
404 |
|
def readJobInfo (this, pid): |
405 |
< |
if (pid == '') or this.JOBS.has_key(pid): |
405 |
> |
if (pid == '') or not this.JOBS.has_key(pid): |
406 |
|
return; |
407 |
|
children = this.getChildren(pid); |
408 |
|
if(len(children) == 0): |
409 |
|
this.logger.log(Logger.INFO, "ProcInfo: Job with pid="+str(pid)+" terminated; removing it from monitored jobs."); |
410 |
+ |
#print ":(" |
411 |
|
this.removeJobToMonitor(pid); |
412 |
|
return; |
413 |
|
try: |
414 |
< |
JSTATUS = os.popen("ps --no-headers --pid " + ",".join([`child` for child in children]) + " -o etime,time,%cpu,%mem,rsz,vsz,comm"); |
414 |
> |
JSTATUS = os.popen("ps --no-headers --pid " + ",".join([`child` for child in children]) + " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm"); |
415 |
|
mem_cmd_map = {}; |
416 |
< |
etime, cputime, pcpu, pmem, rsz, vsz, comm = 0, 0, 0, 0, 0, 0, 0; |
416 |
> |
etime, cputime, pcpu, pmem, rsz, vsz, comm, fd = 0, 0, 0, 0, 0, 0, 0, 0; |
417 |
|
line = JSTATUS.readline(); |
418 |
|
while(line != ''): |
419 |
|
line = line.strip(); |
420 |
< |
m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line); |
420 |
> |
m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line); |
421 |
|
if m != None: |
422 |
< |
etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7); |
422 |
> |
apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7), m.group(8); |
423 |
|
sec = this.parsePSTime(etime1); |
424 |
|
if sec > etime: # the elapsed time is the maximum of all elapsed |
425 |
|
etime = sec; |
430 |
|
# it's the first thread/process with this memory footprint; add it. |
431 |
|
mem_cmd_map[`pmem1`+" "+`rsz1`+" "+`vsz1`+" "+`comm1`] = 1; |
432 |
|
pmem += float(pmem1); rsz += int(rsz1); vsz += int(vsz1); |
433 |
+ |
fd += this.countOpenFD(apid); |
434 |
|
# else not adding memory usage |
435 |
|
line = JSTATUS.readline(); |
436 |
|
JSTATUS.close(); |
440 |
|
this.JOBS[pid]['DATA']['mem_usage'] = pmem; |
441 |
|
this.JOBS[pid]['DATA']['rss'] = rsz; |
442 |
|
this.JOBS[pid]['DATA']['virtualmem'] = vsz; |
443 |
+ |
this.JOBS[pid]['DATA']['open_files'] = fd; |
444 |
|
except IOError, ex: |
445 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps --no-headers -eo \"pid ppid\""); |
446 |
< |
|
446 |
> |
|
447 |
> |
# count the number of open files for the given pid |
448 |
> |
def countOpenFD (this, pid): |
449 |
> |
dir = '/proc/'+str(pid)+'/fd'; |
450 |
> |
if os.access(dir, os.F_OK): |
451 |
> |
if os.access(dir, os.X_OK): |
452 |
> |
list = os.listdir(dir); |
453 |
> |
open_files = len(list); |
454 |
> |
if pid == os.getpid(): |
455 |
> |
open_files -= 2; |
456 |
> |
this.logger.log(Logger.DEBUG, "Counting open_files for "+ `pid` +": "+ str(len(list)) +" => " + `open_files` + " open_files"); |
457 |
> |
return open_files; |
458 |
> |
else: |
459 |
> |
this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job "+`pid`); |
460 |
> |
else: |
461 |
> |
this.logger.log(Logger.ERROR, "ProcInfo: job "+`pid`+" dosen't exist"); |
462 |
> |
|
463 |
> |
|
464 |
|
# if there is an work directory defined, then compute the used space in that directory |
465 |
|
# and the free disk space on the partition to which that directory belongs |
466 |
|
# sizes are given in MB |
467 |
|
def readJobDiskUsage (this, pid): |
468 |
< |
if (pid == '') or this.JOBS.has_key(pid): |
468 |
> |
if (pid == '') or not this.JOBS.has_key(pid): |
469 |
|
return; |
470 |
|
workDir = this.JOBS[pid]['WORKDIR']; |
471 |
|
if workDir == '': |
472 |
|
return; |
473 |
|
try: |
474 |
< |
DU = os.popen("du -Lscm "+workDir+" | tail -1 | cut -f 1"); |
474 |
> |
DU = os.popen("du -Lsck " + workDir + " | tail -1 | cut -f 1"); |
475 |
|
line = DU.readline(); |
476 |
< |
this.JOBS[pid]['DATA']['workdir_size'] = int(line); |
476 |
> |
this.JOBS[pid]['DATA']['workdir_size'] = int(line) / 1024.0; |
477 |
|
except IOError, ex: |
478 |
|
this.logger.log(Logger.ERROR, "ERROR", "ProcInfo: cannot run du to get job's disk usage for job "+`pid`); |
479 |
|
try: |
480 |
< |
DF = os.popen("df -m "+workDir+" | tail -1"); |
480 |
> |
DF = os.popen("df -k "+workDir+" | tail -1"); |
481 |
|
line = DF.readline().strip(); |
482 |
|
m = re.match("\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)%", line); |
483 |
|
if m != None: |
484 |
< |
this.JOBS[pid]['DATA']['disk_total'] = m.group(1); |
485 |
< |
this.JOBS[pid]['DATA']['disk_used'] = m.group(2); |
486 |
< |
this.JOBS[pid]['DATA']['disk_free'] = m.group(3); |
487 |
< |
this.JOBS[pid]['DATA']['disk_usage'] = m.group(4); |
484 |
> |
this.JOBS[pid]['DATA']['disk_total'] = float(m.group(1)) / 1024.0; |
485 |
> |
this.JOBS[pid]['DATA']['disk_used'] = float(m.group(2)) / 1024.0; |
486 |
> |
this.JOBS[pid]['DATA']['disk_free'] = float(m.group(3)) / 1024.0; |
487 |
> |
this.JOBS[pid]['DATA']['disk_usage'] = float(m.group(4)) / 1024.0; |
488 |
|
DF.close(); |
489 |
|
except IOError, ex: |
490 |
|
this.logger.log(Logger.ERROR, "ERROR", "ProcInfo: cannot run df to get job's disk usage for job "+`pid`); |
491 |
|
|
492 |
+ |
# create cummulative parameters based on raw params like cpu_, pages_, swap_, or ethX_ |
493 |
+ |
def computeCummulativeParams(this, dataRef, prevDataRef): |
494 |
+ |
if prevDataRef == {}: |
495 |
+ |
for key in dataRef.keys(): |
496 |
+ |
if key.find('raw_') == 0: |
497 |
+ |
prevDataRef[key] = dataRef[key]; |
498 |
+ |
prevDataRef['TIME'] = dataRef['TIME']; |
499 |
+ |
return; |
500 |
+ |
|
501 |
+ |
# cpu -related params |
502 |
+ |
if (dataRef.has_key('raw_cpu_usr')) and (prevDataRef.has_key('raw_cpu_usr')): |
503 |
+ |
diff={}; |
504 |
+ |
cpu_sum = 0; |
505 |
+ |
for param in ['cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle']: |
506 |
+ |
diff[param] = this.diffWithOverflowCheck(dataRef['raw_'+param], prevDataRef['raw_'+param]); |
507 |
+ |
cpu_sum += diff[param]; |
508 |
+ |
for param in ['cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle']: |
509 |
+ |
if cpu_sum != 0: |
510 |
+ |
dataRef[param] = 100.0 * diff[param] / cpu_sum; |
511 |
+ |
else: |
512 |
+ |
del dataRef[param]; |
513 |
+ |
if cpu_sum != 0: |
514 |
+ |
dataRef['cpu_usage'] = 100.0 * (cpu_sum - diff['cpu_idle']) / cpu_sum; |
515 |
+ |
else: |
516 |
+ |
del dataRef['cpu_usage']; |
517 |
+ |
|
518 |
+ |
# swap & pages -related params |
519 |
+ |
if (dataRef.has_key('raw_pages_in')) and (prevDataRef.has_key('raw_pages_in')): |
520 |
+ |
interval = dataRef['TIME'] - prevDataRef['TIME']; |
521 |
+ |
for param in ['pages_in', 'pages_out', 'swap_in', 'swap_out']: |
522 |
+ |
diff = this.diffWithOverflowCheck(dataRef['raw_'+param], prevDataRef['raw_'+param]); |
523 |
+ |
if interval != 0: |
524 |
+ |
dataRef[param] = 1000.0 * diff / interval; |
525 |
+ |
else: |
526 |
+ |
del dataRef[param]; |
527 |
+ |
|
528 |
+ |
# eth - related params |
529 |
+ |
interval = dataRef['TIME'] - prevDataRef['TIME']; |
530 |
+ |
for rawParam in dataRef.keys(): |
531 |
+ |
if (rawParam.find('raw_eth') == 0) and prevDataRef.has_key(rawParam): |
532 |
+ |
param = rawParam.split('raw_')[1]; |
533 |
+ |
if interval != 0: |
534 |
+ |
dataRef[param] = this.diffWithOverflowCheck(dataRef[rawParam], prevDataRef[rawParam]); # absolute difference |
535 |
+ |
if param.find('_errs') == -1: |
536 |
+ |
dataRef[param] = dataRef[param] / interval / 1024.0; # if it's _in or _out, compute in KB/sec |
537 |
+ |
else: |
538 |
+ |
del dataRef[param]; |
539 |
+ |
|
540 |
+ |
# copy contents of the current data values to the |
541 |
+ |
for param in dataRef.keys(): |
542 |
+ |
if param.find('raw_') == 0: |
543 |
+ |
prevDataRef[param] = dataRef[param]; |
544 |
+ |
prevDataRef['TIME'] = dataRef['TIME']; |
545 |
+ |
|
546 |
+ |
|
547 |
|
# Return a hash containing (param,value) pairs with existing values from the requested ones |
548 |
< |
def getFilteredData (this, dataHash, paramsList): |
548 |
> |
def getFilteredData (this, dataHash, paramsList, prevDataHash = None): |
549 |
> |
|
550 |
> |
if not prevDataHash is None: |
551 |
> |
this.computeCummulativeParams(dataHash, prevDataHash); |
552 |
> |
|
553 |
|
result = {}; |
554 |
|
for param in paramsList: |
555 |
+ |
if param == 'net_sockets': |
556 |
+ |
for key in dataHash.keys(): |
557 |
+ |
if key.find('sockets') == 0 and key.find('sockets_tcp_') == -1: |
558 |
+ |
result[key] = dataHash[key]; |
559 |
+ |
elif param == 'net_tcp_details': |
560 |
+ |
for key in dataHash.keys(): |
561 |
+ |
if key.find('sockets_tcp_') == 0: |
562 |
+ |
result[key] = dataHash[key]; |
563 |
+ |
|
564 |
|
m = re.match("^net_(.*)$", param); |
565 |
|
if m == None: |
566 |
|
m = re.match("^(ip)$", param); |
572 |
|
if m != None: |
573 |
|
result[key] = value; |
574 |
|
else: |
575 |
< |
if dataHash.has_key(param): |
575 |
> |
if param == 'processes': |
576 |
> |
for key in dataHash.keys(): |
577 |
> |
if key.find('processes') == 0: |
578 |
> |
result[key] = dataHash[key]; |
579 |
> |
elif dataHash.has_key(param): |
580 |
|
result[param] = dataHash[param]; |
581 |
< |
return result; |
581 |
> |
sorted_result = []; |
582 |
> |
keys = result.keys(); |
583 |
> |
keys.sort(); |
584 |
> |
for key in keys: |
585 |
> |
sorted_result.append((key,result[key])); |
586 |
> |
return sorted_result; |
587 |
|
|
588 |
|
###################################################################################### |
589 |
|
# self test |
590 |
|
|
591 |
|
if __name__ == '__main__': |
592 |
< |
logger = this.logger(Logger.DEBUG); |
592 |
> |
logger = Logger.Logger(Logger.DEBUG); |
593 |
|
pi = ProcInfo(logger); |
594 |
+ |
|
595 |
|
print "first update"; |
596 |
|
pi.update(); |
597 |
|
print "Sleeping to accumulate"; |
598 |
|
time.sleep(1); |
599 |
|
pi.update(); |
420 |
– |
print "System Monitoring:"; |
600 |
|
|
601 |
+ |
print "System Monitoring:"; |
602 |
|
sys_cpu_params = ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']; |
603 |
|
sys_2_4_params = ['pages_in', 'pages_out', 'swap_in', 'swap_out']; |
604 |
|
sys_mem_params = ['mem_used', 'mem_free', 'total_mem', 'mem_usage']; |
605 |
|
sys_swap_params = ['swap_used', 'swap_free', 'total_swap', 'swap_usage']; |
606 |
|
sys_load_params = ['load1', 'load5', 'load15', 'processes', 'uptime']; |
607 |
< |
sys_gen_params = ['hostname', 'cpu_MHz', 'no_CPUs']; |
607 |
> |
sys_gen_params = ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family', 'cpu_model', 'cpu_model_name', 'bogomips']; |
608 |
|
sys_net_params = ['net_in', 'net_out', 'net_errs', 'ip']; |
609 |
+ |
sys_net_stat = ['sockets_tcp', 'sockets_udp', 'sockets_unix', 'sockets_icm']; |
610 |
+ |
sys_tcp_details = ['sockets_tcp_ESTABLISHED', 'sockets_tcp_SYN_SENT', 'sockets_tcp_SYN_RECV', 'sockets_tcp_FIN_WAIT1', 'sockets_tcp_FIN_WAIT2', 'sockets_tcp_TIME_WAIT', 'sockets_tcp_CLOSED', 'sockets_tcp_CLOSE_WAIT', 'sockets_tcp_LAST_ACK', 'sockets_tcp_LISTEN', 'sockets_tcp_CLOSING', 'sockets_tcp_UNKNOWN']; |
611 |
|
|
612 |
|
print "sys_cpu_params", pi.getSystemData(sys_cpu_params); |
613 |
|
print "sys_2_4_params", pi.getSystemData(sys_2_4_params); |
616 |
|
print "sys_load_params", pi.getSystemData(sys_load_params); |
617 |
|
print "sys_gen_params", pi.getSystemData(sys_gen_params); |
618 |
|
print "sys_net_params", pi.getSystemData(sys_net_params); |
619 |
+ |
print "sys_net_stat", pi.getSystemData(sys_net_stat); |
620 |
+ |
print "sys_tcp_details", pi.getSystemData(sys_tcp_details); |
621 |
+ |
|
622 |
+ |
job_pid = os.getpid(); |
623 |
|
|
624 |
|
print "Job (mysefl) monitoring:"; |
625 |
< |
pi.addJobToMonitor(os.getpid(), os.getcwd()); |
625 |
> |
pi.addJobToMonitor(job_pid, os.getcwd()); |
626 |
|
print "Sleep another second"; |
627 |
|
time.sleep(1); |
628 |
|
pi.update(); |
629 |
|
|
630 |
|
job_cpu_params = ['run_time', 'cpu_time', 'cpu_usage']; |
631 |
< |
job_mem_params = ['mem_usage', 'rss', 'virtualmem']; |
631 |
> |
job_mem_params = ['mem_usage', 'rss', 'virtualmem', 'open_files']; |
632 |
|
job_disk_params = ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']; |
633 |
+ |
time.sleep(10); |
634 |
+ |
print "job_cpu_params", pi.getJobData(job_pid, job_cpu_params); |
635 |
+ |
print "job_mem_params", pi.getJobData(job_pid, job_mem_params); |
636 |
+ |
print "job_disk_params", pi.getJobData(job_pid, job_disk_params); |
637 |
|
|
448 |
– |
print "job_cpu_params", pi.getJobData(os.getpid(), job_cpu_params); |
449 |
– |
print "job_mem_params", pi.getJobData(os.getpid(), job_mem_params); |
450 |
– |
print "job_disk_params", pi.getJobData(os.getpid(), job_disk_params); |
451 |
– |
|
638 |
|
pi.removeJobToMonitor(os.getpid()); |
453 |
– |
|