74 |
|
this.JOBS[pid] = {}; |
75 |
|
this.JOBS[pid]['WORKDIR'] = workDir; |
76 |
|
this.JOBS[pid]['DATA'] = {}; |
77 |
< |
print this.JOBS; |
77 |
> |
#print this.JOBS; |
78 |
|
|
79 |
|
# Call this to stop monitoring a PID |
80 |
|
def removeJobToMonitor (this, pid): |
130 |
|
#this.logger.log(Logger.DEBUG, "key = " + key); |
131 |
|
if key.startswith('cpu_') or key.startswith('pages_') or key.startswith('swap_'): |
132 |
|
#this.logger.log(Logger.DEBUG, "old = "+`this.OLD_RAW[key]`+" new = "+`this.NEW_RAW[key]`); |
133 |
< |
diff[key] = this.NEW_RAW[key] - this.OLD_RAW[key]; |
133 |
> |
#diff[key] = this.NEW_RAW[key] - this.OLD_RAW[key]; |
134 |
> |
diff[key] = this.diffWithOverflowCheck(this.NEW_RAW[key],this.OLD_RAW[key]); |
135 |
|
if key.startswith('cpu_'): |
136 |
|
#this.logger.log(Logger.DEBUG, "diff=" + `diff[key]`); |
137 |
|
cpu_sum += diff[key]; |
225 |
|
if line.startswith("cpu MHz"): |
226 |
|
this.DATA['cpu_MHz'] = float(re.match("cpu MHz\s+:\s+(\d+\.?\d*)", line).group(1)); |
227 |
|
no_cpus += 1; |
228 |
+ |
|
229 |
+ |
if line.startswith("vendor_id"): |
230 |
+ |
this.DATA['cpu_vendor_id'] = re.match("vendor_id\s+:\s+(.+)", line).group(1); |
231 |
+ |
|
232 |
+ |
if line.startswith("cpu family"): |
233 |
+ |
this.DATA['cpu_family'] = re.match("cpu family\s+:\s+(.+)", line).group(1); |
234 |
+ |
|
235 |
+ |
if line.startswith("model") and not line.startswith("model name") : |
236 |
+ |
this.DATA['cpu_model'] = re.match("model\s+:\s+(.+)", line).group(1); |
237 |
+ |
|
238 |
+ |
if line.startswith("model name"): |
239 |
+ |
this.DATA['cpu_model_name'] = re.match("model name\s+:\s+(.+)", line).group(1); |
240 |
+ |
|
241 |
+ |
if line.startswith("bogomips"): |
242 |
+ |
this.DATA['bogomips'] = float(re.match("bogomips\s+:\s+(\d+\.?\d*)", line).group(1)); |
243 |
+ |
|
244 |
|
line = FCPU.readline(); |
245 |
|
FCPU.close(); |
246 |
|
this.DATA['no_CPUs'] = no_cpus; |
257 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime"); |
258 |
|
return; |
259 |
|
|
260 |
+ |
# Return the difference (new - old) of a monotonically increasing kernel
# counter, repairing the value when the counter wrapped around.
# The raw counters read from /proc are unsigned 32- or 64-bit integers:
# when new < old we assume exactly one wrap occurred and add back the
# counter's modulus. The modulus width is guessed from the magnitude of
# the old value (old >= 2^32 implies a 64-bit counter).
def diffWithOverflowCheck(this, new, old):
	if new >= old:
		# no wrap; plain difference
		return new - old;
	else:
		# counter wrapped; pick the modulus from the old value's magnitude
		# (written as (1 << 31) * 2 == 2^32 to stay valid on both
		# Python 2 and Python 3 -- avoids the Py2-only 1L literal)
		width = (1 << 31) * 2;  # 2^32, for 32-bit counters
		if old >= width:
			width = (1 << 63) * 2;  # 2^64, for 64-bit counters
		return new - old + width;
270 |
+ |
|
271 |
|
# read network information like transfered kBps and nr. of errors on each interface |
272 |
|
def readNetworkInfo (this): |
273 |
|
this.OLD_RAW = this.NEW_RAW; |
283 |
|
this.DATA['eth'+m.group(1)+'_errs'] = int(m.group(3)) + int(m.group(5)); |
284 |
|
if(this.OLD_RAW.has_key('eth'+m.group(1)+"_in")): |
285 |
|
#this.logger.log(Logger.DEBUG, "Net I/O eth"+m.group(1)+" interval = "+ `interval`); |
286 |
< |
Bps = (this.NEW_RAW['eth'+m.group(1)+'_in'] - this.OLD_RAW['eth'+m.group(1)+'_in']) / interval; |
286 |
> |
Bps = this.diffWithOverflowCheck(this.NEW_RAW['eth'+m.group(1)+'_in'], this.OLD_RAW['eth'+m.group(1)+'_in']) / interval; |
287 |
|
this.DATA['eth'+m.group(1)+'_in'] = Bps / 1024.0; |
288 |
< |
Bps = (this.NEW_RAW['eth'+m.group(1)+'_out'] - this.OLD_RAW['eth'+m.group(1)+'_out']) / interval; |
288 |
> |
Bps = this.diffWithOverflowCheck(this.NEW_RAW['eth'+m.group(1)+'_out'], this.OLD_RAW['eth'+m.group(1)+'_out']) / interval; |
289 |
|
this.DATA['eth'+m.group(1)+'_out'] = Bps / 1024.0; |
290 |
|
line = FNET.readline(); |
291 |
|
FNET.close(); |
312 |
|
except IOError, ex: |
313 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps --no-headers -eo \"pid ppid\""); |
314 |
|
|
315 |
< |
if not pidmap.has_key(parent): |
315 |
> |
if pidmap.has_key(parent): |
316 |
|
this.logger.log(Logger.INFO, 'ProcInfo: No job with pid='+str(parent)); |
317 |
|
this.removeJobToMonitor(parent); |
318 |
|
return []; |
324 |
|
for (pid, ppid) in pidmap.items(): |
325 |
|
if ppid == prnt: |
326 |
|
children.append(pid); |
327 |
< |
i += 1; |
327 |
> |
i += 1; |
328 |
|
return children; |
329 |
|
|
330 |
|
# internal function that parses a time formatted like "days-hours:min:sec" and returns the corresponding |
331 |
|
# number of seconds. |
332 |
|
def parsePSTime (this, my_time): |
333 |
|
my_time = my_time.strip(); |
334 |
+ |
m = re.match("(\d+)-(\d+):(\d+):(\d+)", my_time); |
335 |
|
if m != None: |
307 |
– |
m = re.match("(\d+)-(\d+):(\d+):(\d+)", my_time); |
336 |
|
return int(m.group(1)) * 24 * 3600 + int(m.group(2)) * 3600 + int(m.group(3)) * 60 + int(m.group(4)); |
337 |
|
else: |
338 |
|
m = re.match("(\d+):(\d+):(\d+)", my_time); |
348 |
|
# read information about this the JOB_PID process |
349 |
|
# memory sizes are given in KB |
350 |
|
def readJobInfo (this, pid): |
351 |
< |
if (pid == '') or this.JOBS.has_key(pid): |
351 |
> |
if (pid == '') or not this.JOBS.has_key(pid): |
352 |
|
return; |
353 |
|
children = this.getChildren(pid); |
354 |
|
if(len(children) == 0): |
355 |
|
this.logger.log(Logger.INFO, "ProcInfo: Job with pid="+str(pid)+" terminated; removing it from monitored jobs."); |
356 |
+ |
#print ":(" |
357 |
|
this.removeJobToMonitor(pid); |
358 |
|
return; |
359 |
|
try: |
360 |
< |
JSTATUS = os.popen("ps --no-headers --pid " + ",".join([`child` for child in children]) + " -o etime,time,%cpu,%mem,rsz,vsz,comm"); |
360 |
> |
JSTATUS = os.popen("ps --no-headers --pid " + ",".join([`child` for child in children]) + " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm"); |
361 |
|
mem_cmd_map = {}; |
362 |
< |
etime, cputime, pcpu, pmem, rsz, vsz, comm = 0, 0, 0, 0, 0, 0, 0; |
362 |
> |
etime, cputime, pcpu, pmem, rsz, vsz, comm, fd = 0, 0, 0, 0, 0, 0, 0, 0; |
363 |
|
line = JSTATUS.readline(); |
364 |
|
while(line != ''): |
365 |
|
line = line.strip(); |
366 |
< |
m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line); |
366 |
> |
m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line); |
367 |
|
if m != None: |
368 |
< |
etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7); |
368 |
> |
apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.group(1), m.group(2), m.group(3), m.group(4), m.group(5), m.group(6), m.group(7), m.group(8); |
369 |
|
sec = this.parsePSTime(etime1); |
370 |
|
if sec > etime: # the elapsed time is the maximum of all elapsed |
371 |
|
etime = sec; |
376 |
|
# it's the first thread/process with this memory footprint; add it. |
377 |
|
mem_cmd_map[`pmem1`+" "+`rsz1`+" "+`vsz1`+" "+`comm1`] = 1; |
378 |
|
pmem += float(pmem1); rsz += int(rsz1); vsz += int(vsz1); |
379 |
+ |
fd += this.countOpenFD(apid); |
380 |
|
# else not adding memory usage |
381 |
|
line = JSTATUS.readline(); |
382 |
|
JSTATUS.close(); |
386 |
|
this.JOBS[pid]['DATA']['mem_usage'] = pmem; |
387 |
|
this.JOBS[pid]['DATA']['rss'] = rsz; |
388 |
|
this.JOBS[pid]['DATA']['virtualmem'] = vsz; |
389 |
+ |
this.JOBS[pid]['DATA']['open_files'] = fd; |
390 |
|
except IOError, ex: |
391 |
|
this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps --no-headers -eo \"pid ppid\""); |
392 |
< |
|
392 |
> |
|
393 |
> |
# Count the number of file descriptors currently opened by the given pid,
# by listing the /proc/<pid>/fd directory.
# Returns 0 when the count cannot be determined (process gone, or the fd
# directory is not accessible) instead of falling through and returning
# None, so that callers can safely accumulate the result
# (readJobInfo does: fd += this.countOpenFD(apid)).
def countOpenFD (this, pid):
	fd_dir = '/proc/' + str(pid) + '/fd';
	if not os.access(fd_dir, os.F_OK):
		this.logger.log(Logger.ERROR, "ProcInfo: job " + str(pid) + " doesn't exist");
		return 0;
	if not os.access(fd_dir, os.X_OK):
		this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + str(pid));
		return 0;
	entries = os.listdir(fd_dir);
	open_files = len(entries);
	if pid == os.getpid():
		# don't count the descriptors opened by listing the directory itself
		open_files -= 2;
	this.logger.log(Logger.DEBUG, "Counting open_files for " + str(pid) + ": " + str(len(entries)) + " => " + str(open_files) + " open_files");
	return open_files;
408 |
> |
|
409 |
> |
|
410 |
|
# if there is an work directory defined, then compute the used space in that directory |
411 |
|
# and the free disk space on the partition to which that directory belongs |
412 |
|
# sizes are given in MB |
413 |
|
def readJobDiskUsage (this, pid): |
414 |
< |
if (pid == '') or this.JOBS.has_key(pid): |
414 |
> |
if (pid == '') or not this.JOBS.has_key(pid): |
415 |
|
return; |
416 |
|
workDir = this.JOBS[pid]['WORKDIR']; |
417 |
|
if workDir == '': |
418 |
|
return; |
419 |
|
try: |
420 |
< |
DU = os.popen("du -Lscm "+workDir+" | tail -1 | cut -f 1"); |
420 |
> |
DU = os.popen("du -Lscm " + workDir + " | tail -1 | cut -f 1"); |
421 |
|
line = DU.readline(); |
422 |
|
this.JOBS[pid]['DATA']['workdir_size'] = int(line); |
423 |
|
except IOError, ex: |
458 |
|
# self test |
459 |
|
|
460 |
|
if __name__ == '__main__': |
461 |
< |
logger = this.logger(Logger.DEBUG); |
461 |
> |
logger = Logger.Logger(Logger.DEBUG); |
462 |
|
pi = ProcInfo(logger); |
463 |
+ |
|
464 |
|
print "first update"; |
465 |
|
pi.update(); |
466 |
|
print "Sleeping to accumulate"; |
467 |
|
time.sleep(1); |
468 |
|
pi.update(); |
420 |
– |
print "System Monitoring:"; |
469 |
|
|
470 |
+ |
print "System Monitoring:"; |
471 |
|
sys_cpu_params = ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']; |
472 |
|
sys_2_4_params = ['pages_in', 'pages_out', 'swap_in', 'swap_out']; |
473 |
|
sys_mem_params = ['mem_used', 'mem_free', 'total_mem', 'mem_usage']; |
474 |
|
sys_swap_params = ['swap_used', 'swap_free', 'total_swap', 'swap_usage']; |
475 |
|
sys_load_params = ['load1', 'load5', 'load15', 'processes', 'uptime']; |
476 |
< |
sys_gen_params = ['hostname', 'cpu_MHz', 'no_CPUs']; |
476 |
> |
sys_gen_params = ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family', 'cpu_model', 'cpu_model_name', 'bogomips']; |
477 |
|
sys_net_params = ['net_in', 'net_out', 'net_errs', 'ip']; |
478 |
|
|
479 |
|
print "sys_cpu_params", pi.getSystemData(sys_cpu_params); |
484 |
|
print "sys_gen_params", pi.getSystemData(sys_gen_params); |
485 |
|
print "sys_net_params", pi.getSystemData(sys_net_params); |
486 |
|
|
487 |
+ |
job_pid = os.getpid(); |
488 |
+ |
|
489 |
|
print "Job (mysefl) monitoring:"; |
490 |
< |
pi.addJobToMonitor(os.getpid(), os.getcwd()); |
490 |
> |
pi.addJobToMonitor(job_pid, os.getcwd()); |
491 |
|
print "Sleep another second"; |
492 |
|
time.sleep(1); |
493 |
|
pi.update(); |
494 |
|
|
495 |
|
job_cpu_params = ['run_time', 'cpu_time', 'cpu_usage']; |
496 |
< |
job_mem_params = ['mem_usage', 'rss', 'virtualmem']; |
496 |
> |
job_mem_params = ['mem_usage', 'rss', 'virtualmem', 'open_files']; |
497 |
|
job_disk_params = ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']; |
498 |
+ |
time.sleep(10); |
499 |
+ |
print "job_cpu_params", pi.getJobData(job_pid, job_cpu_params); |
500 |
+ |
print "job_mem_params", pi.getJobData(job_pid, job_mem_params); |
501 |
+ |
print "job_disk_params", pi.getJobData(job_pid, job_disk_params); |
502 |
|
|
448 |
– |
print "job_cpu_params", pi.getJobData(os.getpid(), job_cpu_params); |
449 |
– |
print "job_mem_params", pi.getJobData(os.getpid(), job_mem_params); |
450 |
– |
print "job_disk_params", pi.getJobData(os.getpid(), job_disk_params); |
451 |
– |
|
503 |
|
pi.removeJobToMonitor(os.getpid()); |
453 |
– |
|