1 |
|
2 |
"""
|
3 |
* ApMon - Application Monitoring Tool
|
4 |
* Version: 2.0.4
|
5 |
*
|
6 |
* Copyright (C) 2005 California Institute of Technology
|
7 |
*
|
8 |
* Permission is hereby granted, free of charge, to use, copy and modify
|
9 |
* this software and its documentation (the "Software") for any
|
10 |
* purpose, provided that existing copyright notices are retained in
|
11 |
* all copies and that this notice is included verbatim in any distributions
|
12 |
* or substantial portions of the Software.
|
13 |
* This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
|
14 |
* Users of the Software are asked to feed back problems, benefits,
|
15 |
* and/or suggestions about the software to the MonALISA Development Team
|
16 |
* (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
|
17 |
* incorporation of new features - is done on a best effort basis. All bug
|
18 |
* fixes and enhancements will be made available under the same terms and
|
19 |
* conditions as the original software,
|
20 |
|
21 |
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
|
22 |
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
|
23 |
* OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
|
24 |
* EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25 |
|
26 |
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
27 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
28 |
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
|
29 |
* PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
|
30 |
* OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
31 |
* MODIFICATIONS.
|
32 |
"""
|
33 |
|
34 |
|
35 |
import os
|
36 |
import re
|
37 |
import time
|
38 |
import socket
|
39 |
import Logger
|
40 |
|
41 |
"""
|
42 |
Class ProcInfo
|
43 |
extracts information from the /proc filesystem for system and job monitoring
|
44 |
"""
|
45 |
class ProcInfo:
    """Extract information from the /proc filesystem for system and job monitoring.

    System-wide values are stored in ``DATA`` and per-job values in
    ``JOBS[pid]['DATA']``.  Both are refreshed by ``update()`` and read back,
    filtered by parameter name, through ``getSystemData()`` and
    ``getJobData()``.  Problems are reported through the logger given to the
    constructor; its ``log(level, message)`` method is the only thing used.
    """

    # One "ethN" line of /proc/net/dev.  Groups: interface digit, received
    # bytes, receive errors, transmitted bytes, transmit errors.  Whitespace
    # after the colon is optional because the kernel pads the byte counter,
    # so short counters are preceded by spaces and long ones are not.
    _NET_DEV_LINE = re.compile(
        r"\s*eth(\d):\s*(\d+)\s+\d+\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+\d+\s+(\d+)")

    # /proc/cpuinfo fields copied verbatim (as strings) into DATA.
    _CPU_TEXT_FIELDS = {
        'vendor_id': 'cpu_vendor_id',
        'cpu family': 'cpu_family',
        'model': 'cpu_model',
        'model name': 'cpu_model_name',
    }

    # /proc/meminfo labels and the DATA key each one feeds.
    _MEM_FIELDS = {
        'MemFree:': 'mem_free',
        'MemTotal:': 'total_mem',
        'SwapFree:': 'swap_free',
        'SwapTotal:': 'total_swap',
    }

    def __init__(this, logger):
        """Create an empty collector that reports problems through *logger*."""
        this.DATA = {}             # monitored data that is going to be reported
        this.OLD_RAW = {}          # raw counters from the previous sample
        this.NEW_RAW = {}          # raw counters from the latest sample
        this.LAST_UPDATE_TIME = 0  # when the last measurement was done (epoch seconds)
        this.JOBS = {}             # pid -> {'WORKDIR': path, 'DATA': {param: value}}
        this.logger = logger       # use the given logger

    def update(this):
        """Refresh all system and job measurements.

        Should be called from time to time, but not more often than once a
        second because of the resolution of time(); a call within the same
        second as the previous one only logs a notice.
        """
        if this.LAST_UPDATE_TIME == int(time.time()):
            this.logger.log(Logger.NOTICE, "ProcInfo: update() called too often!")
            return
        this.readStat()
        this.readMemInfo()
        this.readLoadAvg()
        this.countProcesses()
        this.readGenericInfo()
        this.readNetworkInfo()
        # Iterate over a snapshot: readJobInfo() drops terminated jobs from JOBS.
        for pid in list(this.JOBS.keys()):
            this.readJobInfo(pid)
            this.readJobDiskUsage(pid)
        this.LAST_UPDATE_TIME = int(time.time())

    def addJobToMonitor(this, pid, workDir):
        """Start monitoring process *pid*; *workDir* is its work directory ('' for none)."""
        this.JOBS[pid] = {'WORKDIR': workDir, 'DATA': {}}

    def removeJobToMonitor(this, pid):
        """Stop monitoring *pid*; unknown pids are ignored."""
        if pid in this.JOBS:
            del this.JOBS[pid]

    def getSystemData(this, params):
        """Return a dict with the system-related parameters named in *params*."""
        return this.getFilteredData(this.DATA, params)

    def getJobData(this, pid, params):
        """Return a dict with the job-related parameters named in *params*.

        For a pid that is not monitored an empty list is returned (kept as a
        list, not a dict, for compatibility with existing callers).
        """
        if pid not in this.JOBS:
            return []
        return this.getFilteredData(this.JOBS[pid]['DATA'], params)

    ########################################################################
    # internal functions for system monitoring
    ########################################################################

    def _percent(this, part, total):
        """Return *part* as a percentage of *total*, or 0.0 when *total* is zero."""
        if not total:
            return 0.0
        return 100.0 * part / total

    def readStat(this):
        """Read CPU, paging and swapping counters from /proc/stat.

        Rates are derived from the difference with the previous sample, so
        this has to run twice (with LAST_UPDATE_TIME updated in between) to
        produce results.  The "page" and "swap" lines are not present on 2.6
        kernels; the corresponding values are then simply not reported.
        """
        this.OLD_RAW = this.NEW_RAW.copy()
        try:
            with open('/proc/stat') as fstat:
                for line in fstat:
                    elem = line.split()
                    if line.startswith("cpu ") and len(elem) > 4:
                        this.NEW_RAW['cpu_usr'] = float(elem[1])
                        this.NEW_RAW['cpu_nice'] = float(elem[2])
                        this.NEW_RAW['cpu_sys'] = float(elem[3])
                        this.NEW_RAW['cpu_idle'] = float(elem[4])
                    if line.startswith("page") and len(elem) > 2:
                        this.NEW_RAW['pages_in'] = float(elem[1])
                        this.NEW_RAW['pages_out'] = float(elem[2])
                    if line.startswith("swap") and len(elem) > 2:
                        this.NEW_RAW['swap_in'] = float(elem[1])
                        this.NEW_RAW['swap_out'] = float(elem[2])
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/stat")
            return
        if not this.OLD_RAW:
            return  # first sample: nothing to diff against yet

        diff = {}
        cpu_sum = 0
        for key, new_value in this.NEW_RAW.items():
            if not key.startswith(('cpu_', 'pages_', 'swap_')):
                continue
            if key not in this.OLD_RAW:
                continue  # counter appeared only in this sample
            diff[key] = this.diffWithOverflowCheck(new_value, this.OLD_RAW[key])
            if key.startswith('cpu_'):
                cpu_sum += diff[key]

        cpu_keys = ('cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle')
        if cpu_sum > 0 and all(key in diff for key in cpu_keys):
            for key in cpu_keys:
                this.DATA[key] = 100.0 * diff[key] / cpu_sum
            busy = diff['cpu_usr'] + diff['cpu_sys'] + diff['cpu_nice']
            this.DATA['cpu_usage'] = 100.0 * busy / cpu_sum

        elapsed = int(time.time()) - this.LAST_UPDATE_TIME
        if elapsed > 0:
            for key in ('pages_in', 'pages_out', 'swap_in', 'swap_out'):
                if key in diff:
                    this.DATA[key] = diff[key] / elapsed

    def readMemInfo(this):
        """Read memory and swap sizes from /proc/meminfo.

        Sizes are reported in MB (the file's values divided by 1024), except
        the *_usage values, which are percentages.  A total of zero (e.g. no
        swap configured) yields a usage of 0.0.
        """
        try:
            with open('/proc/meminfo') as fmem:
                for line in fmem:
                    elem = line.split()
                    if len(elem) > 1 and elem[0] in this._MEM_FIELDS:
                        this.DATA[this._MEM_FIELDS[elem[0]]] = float(elem[1]) / 1024.0
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo")
            return
        for kind, total_key in (('mem', 'total_mem'), ('swap', 'total_swap')):
            free_key = kind + '_free'
            if total_key not in this.DATA or free_key not in this.DATA:
                continue
            used = this.DATA[total_key] - this.DATA[free_key]
            this.DATA[kind + '_used'] = used
            this.DATA[kind + '_usage'] = this._percent(used, this.DATA[total_key])

    def readLoadAvg(this):
        """Read the 1, 5 and 15 minute load averages from /proc/loadavg."""
        try:
            with open('/proc/loadavg') as favg:
                elem = favg.readline().split()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/loadavg")
            return
        if len(elem) >= 3:
            this.DATA['load1'] = float(elem[0])
            this.DATA['load5'] = float(elem[1])
            this.DATA['load15'] = float(elem[2])

    def countProcesses(this):
        """Count the processes on the system (the all-digit entries of /proc)."""
        try:
            entries = os.listdir("/proc")
        except (IOError, OSError):  # os.listdir reports failures as OSError
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc to count processes")
            return
        this.DATA['processes'] = len([name for name in entries if name.isdigit()])

    def readGenericInfo(this):
        """Read the hostname, interface IPs, CPU description and uptime.

        The three sources are independent: a failure in one is logged and
        does not prevent the others from being read.
        """
        this.DATA['hostname'] = socket.getfqdn()
        this._readInterfaceAddresses()
        this._readCpuInfo()
        this._readUptime()

    def _readInterfaceAddresses(this):
        """Store the IPv4 address of each eth* interface as DATA['<iface>_ip'].

        Parses `/sbin/ifconfig -a`; both the "inet addr:a.b.c.d" and the
        "inet a.b.c.d" output formats are accepted.
        """
        try:
            with os.popen('/sbin/ifconfig -a') as output:
                eth = ''
                for raw in output:
                    line = raw.strip()
                    if line == '':
                        eth = ''  # blank line ends the current interface block
                        continue
                    if line.startswith("eth"):
                        # newer ifconfig prints "eth0:" as the first token
                        eth = line.split()[0].rstrip(':')
                    if eth:
                        m = re.match(r"inet\s+(?:addr:)?(\d+\.\d+\.\d+\.\d+)", line)
                        if m is not None:
                            this.DATA[eth + '_ip'] = m.group(1)
                            eth = ''
        except (IOError, OSError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from /sbin/ifconfig -a")

    def _readCpuInfo(this):
        """Read CPU frequency, identification and bogomips from /proc/cpuinfo.

        The number of CPUs is the number of "cpu MHz" lines found.
        """
        no_cpus = 0
        try:
            with open('/proc/cpuinfo') as fcpu:
                for line in fcpu:
                    name, sep, value = line.partition(':')
                    name = name.strip()
                    value = value.strip()
                    if not sep or not value:
                        continue
                    if name in this._CPU_TEXT_FIELDS:
                        this.DATA[this._CPU_TEXT_FIELDS[name]] = value
                    elif name in ('cpu MHz', 'bogomips'):
                        m = re.match(r"\d+\.?\d*", value)
                        if m is None:
                            continue
                        if name == 'cpu MHz':
                            this.DATA['cpu_MHz'] = float(m.group(0))
                            no_cpus += 1
                        else:
                            this.DATA['bogomips'] = float(m.group(0))
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/cpuinfo")
            return
        this.DATA['no_CPUs'] = no_cpus

    def _readUptime(this):
        """Store the system uptime, in days, read from /proc/uptime."""
        try:
            with open('/proc/uptime') as fupt:
                elem = fupt.readline().split()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime")
            return
        if elem:
            this.DATA['uptime'] = float(elem[0]) / (24.0 * 3600)

    def diffWithOverflowCheck(this, new, old):
        """Return new - old, repairing a wrap-around of the counter.

        The counter is taken to be unsigned 32 bit, or 64 bit when the old
        value does not fit in 32 bits.
        """
        if new >= old:
            return new - old
        wrap = 1 << 32      # 32 bits
        if old >= wrap:
            wrap = 1 << 64  # 64 bits
        return new - old + wrap

    def readNetworkInfo(this):
        """Read traffic (KB/s) and error counts per eth interface from /proc/net/dev."""
        # Snapshot, not alias: NEW_RAW is updated below and the rates need
        # the values from the previous sample.
        this.OLD_RAW = this.NEW_RAW.copy()
        interval = int(time.time()) - this.LAST_UPDATE_TIME
        try:
            with open('/proc/net/dev') as fnet:
                for line in fnet:
                    this._parseNetDevLine(line, interval)
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/net/dev")

    def _parseNetDevLine(this, line, interval):
        """Update NEW_RAW/DATA from one /proc/net/dev line; non-eth lines are ignored.

        *interval* is the number of seconds since the previous sample; rates
        are only computed when it is positive and a previous sample exists.
        """
        m = this._NET_DEV_LINE.match(line)
        if m is None:
            return
        prefix = 'eth' + m.group(1)
        this.NEW_RAW[prefix + '_in'] = float(m.group(2))
        this.NEW_RAW[prefix + '_out'] = float(m.group(4))
        this.DATA[prefix + '_errs'] = int(m.group(3)) + int(m.group(5))
        if interval <= 0:
            return
        for key in (prefix + '_in', prefix + '_out'):
            if key not in this.OLD_RAW:
                continue
            Bps = this.diffWithOverflowCheck(this.NEW_RAW[key], this.OLD_RAW[key]) / interval
            this.DATA[key] = Bps / 1024.0

    ########################################################################
    # job monitoring related functions
    ########################################################################

    def getChildren(this, parent):
        """Return the pids (ints) of *parent* and all of its descendants.

        When the process list is available and *parent* is not in it, the job
        is removed from the monitored jobs and an empty list is returned.
        When `ps` yields nothing the liveness is unknown, so only the parent
        itself is returned and the job is kept.
        """
        pidmap = {}
        try:
            with os.popen('ps --no-headers -eo "pid ppid"') as output:
                for line in output:
                    elem = line.split()
                    if len(elem) >= 2 and elem[0].isdigit() and elem[1].isdigit():
                        pidmap[int(elem[0])] = int(elem[1])
        except (IOError, OSError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps --no-headers -eo \"pid ppid\"")

        # Callers may register the pid as an int or as a string.
        try:
            root = int(parent)
        except (TypeError, ValueError):
            root = None

        if root is None or (pidmap and root not in pidmap):
            this.logger.log(Logger.INFO, 'ProcInfo: No job with pid=' + str(parent))
            this.removeJobToMonitor(parent)
            return []
        return this._descendants(root, pidmap)

    def _descendants(this, root, pidmap):
        """Return [root] followed by every pid reachable from it in *pidmap* (pid -> ppid)."""
        kids = {}
        for pid, ppid in pidmap.items():
            kids.setdefault(ppid, []).append(pid)
        found = [root]
        seen = set(found)
        i = 0
        while i < len(found):
            for pid in sorted(kids.get(found[i], [])):
                if pid not in seen:  # guards against a pid listed as its own ancestor
                    seen.add(pid)
                    found.append(pid)
            i += 1
        return found

    def parsePSTime(this, my_time):
        """Convert a time printed by ps into seconds.

        Accepts "days-hours:min:sec", "hours:min:sec" and "min:sec"; anything
        else yields 0.
        """
        my_time = my_time.strip()
        m = re.match(r"(\d+)-(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            days, hours, minutes, seconds = [int(g) for g in m.groups()]
            return days * 24 * 3600 + hours * 3600 + minutes * 60 + seconds
        m = re.match(r"(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            hours, minutes, seconds = [int(g) for g in m.groups()]
            return hours * 3600 + minutes * 60 + seconds
        m = re.match(r"(\d+):(\d+)", my_time)
        if m is not None:
            return int(m.group(1)) * 60 + int(m.group(2))
        return 0

    def readJobInfo(this, pid):
        """Collect run time, CPU, memory and open-file figures for job *pid*.

        The figures aggregate the job's whole process tree.  Memory sizes are
        given in KB.  A job whose process is gone is removed from JOBS.
        """
        if pid == '' or pid not in this.JOBS:
            return
        children = this.getChildren(pid)
        if len(children) == 0:
            this.logger.log(Logger.INFO, "ProcInfo: Job with pid=" + str(pid) + " terminated; removing it from monitored jobs.")
            this.removeJobToMonitor(pid)
            return

        command = ("ps --no-headers --pid " + ",".join([str(child) for child in children])
                   + " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm")
        footprints = set()  # (pmem, rsz, vsz, comm) combinations already counted
        etime, cputime, pcpu, pmem, rsz, vsz, fd = 0, 0, 0, 0, 0, 0, 0
        try:
            with os.popen(command) as jstatus:
                for line in jstatus:
                    m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line.strip())
                    if m is None:
                        continue
                    apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.groups()
                    try:
                        cpu_pct, mem_pct = float(pcpu1), float(pmem1)
                        rss_kb, vsz_kb = int(rsz1), int(vsz1)
                    except ValueError:
                        continue  # not a regular ps data row
                    # the elapsed time is the maximum over all processes
                    etime = max(etime, this.parsePSTime(etime1))
                    # cpu time and %cpu are summed over all processes
                    cputime += this.parsePSTime(cputime1)
                    pcpu += cpu_pct
                    footprint = (pmem1, rsz1, vsz1, comm1)
                    if footprint not in footprints:
                        # first thread/process with this memory footprint; count it once
                        footprints.add(footprint)
                        pmem += mem_pct
                        rsz += rss_kb
                        vsz += vsz_kb
                        # countOpenFD() returns None when it cannot count
                        fd += this.countOpenFD(apid) or 0
        except (IOError, OSError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps to read the status of job " + str(pid))
            return

        data = this.JOBS[pid]['DATA']
        data['run_time'] = etime
        data['cpu_time'] = cputime
        data['cpu_usage'] = pcpu
        data['mem_usage'] = pmem
        data['rss'] = rsz
        data['virtualmem'] = vsz
        data['open_files'] = fd

    def countOpenFD(this, pid):
        """Return the number of entries in /proc/<pid>/fd, or None if it cannot be read."""
        fd_dir = '/proc/' + str(pid) + '/fd'
        if not os.access(fd_dir, os.F_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: job " + repr(pid) + " dosen't exist")
            return None
        if not os.access(fd_dir, os.X_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return None
        try:
            entries = os.listdir(fd_dir)
        except OSError:
            # the process can disappear between the access check and the listing
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return None
        open_files = len(entries)
        # pid may arrive as a string (from ps output) or as an int.
        if str(pid) == str(os.getpid()):
            # NOTE(review): presumably discounts descriptors held open by the
            # monitoring itself while counting - confirm why exactly 2.
            open_files -= 2
        this.logger.log(Logger.DEBUG, "Counting open_files for " + repr(pid) + ": " + str(len(entries)) + " => " + repr(open_files) + " open_files")
        return open_files

    def _shellQuote(this, path):
        """Return *path* single-quoted so the shell passes it on as one literal word."""
        return "'" + str(path).replace("'", "'\\''") + "'"

    def readJobDiskUsage(this, pid):
        """Measure the job's work directory and the partition it lives on.

        Runs `du` and `df` on the work directory; sizes are given in MB.
        Nothing is done when the job has no work directory defined.  The path
        is passed to the shell quoted, so it is taken literally (no glob or
        tilde expansion).
        """
        if pid == '' or pid not in this.JOBS:
            return
        workDir = this.JOBS[pid]['WORKDIR']
        if not workDir:
            return
        quoted = this._shellQuote(workDir)
        data = this.JOBS[pid]['DATA']

        try:
            with os.popen("du -Lscm " + quoted + " | tail -1 | cut -f 1") as du:
                line = du.readline().strip()
        except (IOError, OSError):
            line = ''
        if line.isdigit():
            data['workdir_size'] = int(line)
        else:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run du to get job's disk usage for job " + repr(pid))

        try:
            with os.popen("df -m " + quoted + " | tail -1") as df:
                line = df.readline().strip()
        except (IOError, OSError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run df to get job's disk usage for job " + repr(pid))
            return
        m = re.match(r"\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)%", line)
        if m is not None:
            # NOTE(review): these are stored as the strings printed by df,
            # unlike workdir_size; kept as-is because callers may rely on it.
            data['disk_total'] = m.group(1)
            data['disk_used'] = m.group(2)
            data['disk_free'] = m.group(3)
            data['disk_usage'] = m.group(4)

    def getFilteredData(this, dataHash, paramsList):
        """Return the (param, value) pairs of *dataHash* selected by *paramsList*.

        A parameter "net_<x>" selects every key "eth<digit>_<x>" and "ip"
        selects every "eth<digit>_ip"; any other parameter selects the key of
        the same name.  Parameters without a value are left out.
        """
        result = {}
        for param in paramsList:
            m = re.match(r"^net_(.*)$", param) or re.match(r"^(ip)$", param)
            if m is None:
                if param in dataHash:
                    result[param] = dataHash[param]
                continue
            net_param = m.group(1)
            for key, value in dataHash.items():
                if re.match(r"eth\d_" + net_param, key) is not None:
                    result[key] = value
        return result
|
456 |
|
457 |
######################################################################################
|
458 |
# self test
|
459 |
|
460 |
if __name__ == '__main__':
    # Self test: sample the system twice (rates need two samples), print every
    # group of system parameters, then monitor this very process as a job.
    logger = Logger.Logger(Logger.DEBUG)
    pi = ProcInfo(logger)

    # print() is always called with a single pre-formatted string so that the
    # output is the same whether print is a statement or a function.
    print("first update")
    pi.update()
    print("Sleeping to accumulate")
    time.sleep(1)
    pi.update()

    print("System Monitoring:")
    system_groups = [
        ("sys_cpu_params", ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']),
        ("sys_2_4_params", ['pages_in', 'pages_out', 'swap_in', 'swap_out']),
        ("sys_mem_params", ['mem_used', 'mem_free', 'total_mem', 'mem_usage']),
        ("sys_swap_params", ['swap_used', 'swap_free', 'total_swap', 'swap_usage']),
        ("sys_load_params", ['load1', 'load5', 'load15', 'processes', 'uptime']),
        ("sys_gen_params", ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family',
                            'cpu_model', 'cpu_model_name', 'bogomips']),
        ("sys_net_params", ['net_in', 'net_out', 'net_errs', 'ip']),
    ]
    for label, params in system_groups:
        print("%s %s" % (label, pi.getSystemData(params)))

    job_pid = os.getpid()

    print("Job (myself) monitoring:")
    pi.addJobToMonitor(job_pid, os.getcwd())
    print("Sleep another second")
    time.sleep(1)  # update() refuses to run twice within the same second
    pi.update()

    job_groups = [
        ("job_cpu_params", ['run_time', 'cpu_time', 'cpu_usage']),
        ("job_mem_params", ['mem_usage', 'rss', 'virtualmem', 'open_files']),
        ("job_disk_params", ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']),
    ]
    for label, params in job_groups:
        print("%s %s" % (label, pi.getJobData(job_pid, params)))

    pi.removeJobToMonitor(job_pid)
|