ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/ProcInfo.py
Revision: 1.4
Committed: Fri Mar 28 17:18:46 2008 UTC (17 years, 1 month ago) by slacapra
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_2_9_1, CRAB_2_9_1_pre2, CRAB_2_9_1_pre1, CRAB_2_9_0, CRAB_2_9_0_pre2, CRAB_2_9_0_pre1, CRAB_2_8_8, CRAB_2_8_8_pre1, CRAB_2_8_7_patch3, CRAB_2_8_7_patch2, CRAB_2_8_7_patch1, CRAB_2_8_7, CRAB_2_8_7_pre2, CRAB_2_8_7_pre1, CRAB_2_8_6, CRAB_2_8_6_pre1, CRAB_2_8_5_patch3, CRAB_2_8_5_patch2, CRAB_2_8_5_patch1, CRAB_2_8_5, CRAB_2_8_5_pre5, CRAB_2_8_5_pre4, CRAB_2_8_5_pre3, CRAB_2_8_4_patch3, CRAB_2_8_5_pre2, CRAB_2_8_4_patch2, CRAB_2_8_5_pre1, CRAB_2_8_4_patch1, CRAB_2_8_4, CRAB_2_8_4_pre5, CRAB_2_8_4_pre4, CRAB_2_8_4_pre3, CRAB_2_8_4_pre2, CRAB_2_8_4_pre1, CRAB_2_8_3, CRAB_2_8_3_pre4, CRAB_2_8_3_pre3, CRAB_2_8_3_pre2, CRAB_2_8_3_pre1, CRAB_2_8_2_patch1, CRAB_2_8_2, CRAB_2_8_2_pre5, CRAB_2_8_2_pre4, CRAB_2_8_2_pre3, CRAB_2_8_2_pre2, CRAB_2_8_2_pre1, CRAB_2_8_1, CRAB_2_8_0, CRAB_2_8_0_pre1, CRAB_2_7_10_pre3, CRAB_2_7_9_patch2_pre1, CRAB_2_7_10_pre2, CRAB_2_7_10_pre1, CRAB_2_7_9_patch1, CRAB_2_7_9, CRAB_2_7_9_pre5, CRAB_2_7_9_pre4, CRAB_2_7_9_pre3, CRAB_2_7_9_pre2, CRAB_2_7_8_patch2, CRAB_2_7_9_pre1, CRAB_2_7_8_patch2_pre1, CRAB_2_7_8_patch1, CRAB_2_7_8_patch1_pre1, CRAB_2_7_8, CRAB_2_7_8_pre3, CRAB_2_7_8_pre2, CRAB_2_7_8_dash3, CRAB_2_7_8_dash2, CRAB_2_7_8_dash, CRAB_2_7_7_patch1, CRAB_2_7_7_patch1_pre1, CRAB_2_7_8_pre1, CRAB_2_7_7, CRAB_2_7_7_pre2, CRAB_2_7_7_pre1, CRAB_2_7_6_patch1, CRAB_2_7_6, CRAB_2_7_6_pre1, CRAB_2_7_5_patch1, CRAB_2_7_5, CRAB_2_7_5_pre3, CRAB_2_7_5_pre2, CRAB_2_7_5_pre1, CRAB_2_7_4_patch1, CRAB_2_7_4, CRAB_2_7_4_pre6, CRAB_2_7_4_pre5, CRAB_2_7_4_pre4, CRAB_2_7_4_pre3, CRAB_2_7_4_pre2, CRAB_2_7_4_pre1, CRAB_2_7_3, CRAB_2_7_3_pre3, CRAB_2_7_3_pre3_beta, CRAB_2_7_3_pre2, CRAB_2_7_3_pre2_beta, CRAB_2_7_3_pre1, CRAB_2_7_3_beta3, CRAB_2_7_3_beta2, CRAB_2_7_3_beta1, CRAB_2_7_3_beta, CRAB_2_7_2_p1, CRAB_2_7_1_branch_firstMERGE, CRAB_2_7_2, CRAB_2_7_2_pre4, CRAB_2_7_2_pre3, CRAB_2_7_2_pre2, CRAB_2_7_2_pre1, CRAB_2_7_1, fede_170310, CRAB_2_7_1_pre12, CRAB_2_7_1_pre11, CRAB_2_7_1_pre10, CRAB_2_7_1_pre9, CRAB_LumiMask, CRAB_2_7_lumi, from_LimiMask, 
CRAB_2_7_1_pre8, CRAB_2_7_1_pre6, CRAB_2_7_1_pre5, CRAB_2_7_1_wmbs_pre4, CRAB_2_7_1_pre4, CRAB_2_7_1_pre3, CRAB_2_6_6_pre6, CRAB_2_7_1_pre2, CRAB_2_6_6_pre5, CRAB_2_7_1_pre1, CRAB_2_6_6_pre4, CRAB_2_6_6_pre3, CRAB_2_6_6_pre2, CRAB_2_6_6_check, CRAB_2_6_6, CRAB_2_6_6_pre1, CRAB_2_7_0, CRAB_2_6_5, CRAB_2_7_0_pre8, CRAB_2_6_5_pre1, CRAB_2_7_0_pre7, CRAB_2_6_4, CRAB_2_7_0_pre6, CRAB_2_6_4_pre1, CRAB_2_7_0_pre5, CRAB_2_6_3_patch_2, CRAB_2_6_3_patch_2_pre2, CRAB_2_6_3_patch_2_pre1, CRAB_2_6_3_patch_1, CRAB_2_7_0_pre4, CRAB_2_7_0_pre3, CRAB_2_6_3, CRAB_2_6_3_pre5, CRAB_2_6_3_pre4, CRAB_2_6_3_pre3, CRAB_2_6_3_pre2, CRAB_2_7_0_pre2, CRAB_2_6_3_pre1, test_1, CRAB_2_7_0_pre1, CRAB_2_6_2, CRAB_2_6_2_pre2, CRAB_2_6_2_pre1, CRAB_2_6_1_pre4, CRAB_2_6_1_pre3, CRAB_2_6_1_pre2, CRAB_2_6_1_pre1, CRAB_2_6_1, CRAB_2_6_0, CRAB_2_6_0_pre14, CRAB_2_6_0_pre13, CRAB_2_6_0_pre12, CRAB_2_6_0_pre11, CRAB_2_6_0_pre10, CRAB_2_6_0_pre9, CRAB_2_6_0_pre8, CRAB_2_6_0_pre7, CRAB_2_6_0_pre6, CRAB_2_6_0_pre5, CRAB_2_6_0_pre4, CRAB_2_6_0_pre3, CRAB_2_6_0_pre2, CRAB_2_6_0_pre1, CRAB_2_5_1, CRAB_2_5_1_pre4, CRAB_2_5_1_pre3, CRAB_2_5_1_pre2, CRAB_2_5_1_pre1, CRAB_2_5_0, CRAB_2_5_0_pre7, CRAB_2_5_0_pre6, CRAB_2_5_0_pre5, CRAB_2_5_0_pre4, CRAB_2_5_0_pre3, CRAB_2_5_0_pre2, CRAB_2_5_0_pre1, CRAB_2_4_4, CRAB_2_4_4_pre6, CRAB_2_4_4_pre5, CRAB_2_4_4_pre4, CRAB_2_4_4_pre3, CRAB_2_4_4_pre2, CRAB_2_4_4_pre1, CRAB_2_4_3, CRAB_2_4_3_pre8, CRAB_2_4_3_pre7, CRAB_2_4_3_pre6, CRAB_2_4_3_pre5, CRAB_2_4_3_pre3, CRAB_2_4_3_pre2, CRAB_2_4_3_pre1, CRAB_2_4_2, CRAB_2_4_2_pre3, CRAB_2_4_2_pre2, CRAB_2_4_2_pre1, CRAB_2_4_1, CRAB_2_4_1_pre4, CRAB_2_4_1_pre3, CRAB_2_4_1_pre2, CRAB_2_4_1_pre1, CRAB_2_4_0_Tutorial, CRAB_2_4_0_Tutorial_pre1, CRAB_2_4_0, CRAB_2_4_0_pre9, CRAB_2_4_0_pre8, CRAB_2_4_0_pre7, CRAB_2_4_0_pre6, CRAB_2_4_0_pre5, CRAB_2_4_0_pre4, CRAB_2_4_0_pre3, CRAB_2_4_0_pre2, CRAB_2_4_0_pre1, CRAB_DLS_PHED1, CRAB_DLS_PHED, CRAB_2_3_2_Fnal, CRAB_2_3_2, CRAB_2_3_2_pre7, CRAB_2_3_2_pre5, CRAB_2_3_2_pre4, CRAB_2_3_2_pre3, 
CRAB_2_3_2_pre2, CRAB_2_3_2_pre1, CRAB_2_4_0_test, CRAB_2_3_1, CRAB_2_3_1_pre6, CRAB_2_3_1_pre5, CRAB_2_3_1_pre4, CRAB_2_3_1_pre3, CRAB_2_3_1_pre2, CRAB_2_3_1_pre1, CRAB_2_3_0, CRAB_2_3_0_pre6, CRAB_2_3_0_pre1, CRAB_2_2_2_pre5, CRAB_2_2_2_pre4, CRAB_2_2_2_pre3, CRAB_2_2_2_pre2, CRAB_2_2_2_pre1, CRAB_2_2_1, CRAB_2_2_1_pre6, CRAB_2_2_1_pre5, CRAB_2_2_1_pre4, PRODCOMMON_0_10_7_testCS2, CRAB_2_2_1_pre3, CRAB_2_2_1_pre2, CRAB_2_2_1_pre1, CRAB_2_2_0, CRAB_2_2_0_pre21, CRAB_2_2_0_pre19, CRAB_2_2_0_pre18, CRAB_2_2_0_pre17, CRAB_2_2_0_pre16, CRAB_2_2_0_pre15, CRAB_2_2_0_pre13, CRAB_2_2_0_pre12, CRAB_2_2_0_pre11, CRAB_2_2_0_pre10, bp_osg_bdii, CRAB_2_2_0_pre9, CRAB_2_2_0_pre8, CRAB_2_2_0_pre7, CRAB_2_1_2, CRAB_2_2_0_pre5, CRAB_2_1_2_pre2, CRAB_2_1_2_pre1, CRAB_2_2_0_pre4, CRAB_2_2_0_pre2, CRAB_2_1_1, HEAD
Branch point for: CRAB_multiout, CRAB_2_7_1_branch, Lumi2_8, CRAB_2_6_X_br, AnaDataSet, CRAB_2_3_0_br, osg_bdii, CRAB_2_1_2_br
Changes since 1.3: +1 -1 lines
Log Message:
Upgrade of dashboard-related classes to improve submission of monitoring info for large tasks: see https://hypernews.cern.ch/HyperNews/CMS/get/crabDevelopment/657/1/1/1/2/1/1.html

File Contents

# User Rev Content
1 corvo 1.1
2     """
3     * ApMon - Application Monitoring Tool
4 corvo 1.3 * Version: 2.2.1
5 corvo 1.1 *
6 corvo 1.3 * Copyright (C) 2006 California Institute of Technology
7 corvo 1.1 *
8     * Permission is hereby granted, free of charge, to use, copy and modify
9     * this software and its documentation (the "Software") for any
10     * purpose, provided that existing copyright notices are retained in
11     * all copies and that this notice is included verbatim in any distributions
12     * or substantial portions of the Software.
13     * This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
14     * Users of the Software are asked to feed back problems, benefits,
15     * and/or suggestions about the software to the MonALISA Development Team
16     * (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
17     * incorporation of new features - is done on a best effort basis. All bug
18     * fixes and enhancements will be made available under the same terms and
19     * conditions as the original software,
20    
21     * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
22     * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
23     * OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
24     * EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25    
26     * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
27     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
28     * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
29     * PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
30     * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
31     * MODIFICATIONS.
32     """
33    
34    
35     import os
36     import re
37     import time
38 corvo 1.3 import string
39 corvo 1.1 import socket
40     import Logger
41    
42     """
43     Class ProcInfo
44     extracts information from the proc/ filesystem for system and job monitoring
45     """
class ProcInfo:
    """Collect system and per-job monitoring values from /proc and shell tools.

    System-wide values are stored in ``DATA``; values for each monitored job
    are stored in ``JOBS[pid]['DATA']``.  Call update() periodically (at most
    once per second) and read the results with getSystemData() and
    getJobData(), which return sorted lists of ``(name, value)`` tuples.
    """

    # Process state codes reported individually by countProcesses().
    PROCESS_STATES = ('D', 'R', 'S', 'T', 'Z')

    # TCP connection states reported individually by readNetStat().
    TCP_STATES = ('ESTABLISHED', 'SYN_SENT', 'SYN_RECV', 'FIN_WAIT1',
                  'FIN_WAIT2', 'TIME_WAIT', 'CLOSED', 'CLOSE_WAIT',
                  'LAST_ACK', 'LISTEN', 'CLOSING', 'UNKNOWN')

    def __init__(this, logger):
        """Create an empty monitor that reports problems through *logger*."""
        this.DATA = {}              # monitored data that is going to be reported
        this.LAST_UPDATE_TIME = 0   # when the last measurement was done
        this.JOBS = {}              # jobs that will be monitored
        this.logger = logger        # use the given logger
        lines = this._commandOutput('uname -s')
        if lines:
            this.OS_TYPE = lines[0].replace('\n', '')
        else:
            this.OS_TYPE = ''

    # ------------------------------------------------------------------
    # small I/O helpers
    # ------------------------------------------------------------------

    def _fileLines(this, path):
        """Return every line of *path*, closing the file even on error."""
        handle = open(path)
        try:
            return handle.readlines()
        finally:
            handle.close()

    def _commandOutput(this, command):
        """Run *command* through the shell and return its output lines."""
        pipe = os.popen(command)
        try:
            return pipe.readlines()
        finally:
            pipe.close()

    # ------------------------------------------------------------------
    # public interface
    # ------------------------------------------------------------------

    def update(this):
        """Refresh all system values and the values of every monitored job.

        Should not be called more than once a second because timestamps are
        kept with one-second resolution; a too-early call is logged and
        ignored.
        """
        if this.LAST_UPDATE_TIME == int(time.time()):
            this.logger.log(Logger.NOTICE, "ProcInfo: update() called too often!")
            return
        this.readStat()
        this.readMemInfo()
        if this.OS_TYPE == 'Darwin':
            this.darwin_readLoadAvg()
        else:
            this.readLoadAvg()
        this.countProcesses()
        this.readGenericInfo()
        this.readNetworkInfo()
        this.readNetStat()
        # Iterate over a copy: readJobInfo() may drop finished jobs from JOBS.
        for pid in list(this.JOBS.keys()):
            this.readJobInfo(pid)
            this.readJobDiskUsage(pid)
        stamp = int(time.time())
        this.LAST_UPDATE_TIME = stamp
        this.DATA['TIME'] = stamp

    def addJobToMonitor(this, pid, workDir):
        """Start monitoring process *pid*, whose work directory is *workDir*."""
        this.JOBS[pid] = {'WORKDIR': workDir, 'DATA': {}}

    def removeJobToMonitor(this, pid):
        """Stop monitoring *pid*; unknown pids are ignored."""
        if pid in this.JOBS:
            del this.JOBS[pid]

    def getSystemData(this, params, prevDataRef=None):
        """Return the requested system parameters as sorted (name, value) pairs.

        When *prevDataRef* (a dict owned by the caller) is given, rate-like
        parameters are derived from the difference to the raw counters stored
        in it, and the dict is updated with the current raw counters.
        """
        return this.getFilteredData(this.DATA, params, prevDataRef)

    def getJobData(this, pid, params):
        """Return the requested parameters of job *pid* ([] if not monitored)."""
        if pid not in this.JOBS:
            return []
        return this.getFilteredData(this.JOBS[pid]['DATA'], params)

    ########################################################################
    # internal functions for system monitoring
    ########################################################################

    def readStat(this):
        """Store the raw cpu/page/swap counters found in /proc/stat.

        Only the raw counters are stored here; computeCummulativeParams()
        turns two successive samples into usable values.  The "page" and
        "swap" lines are not present on every kernel.
        """
        wanted = {
            'cpu': ('raw_cpu_usr', 'raw_cpu_nice', 'raw_cpu_sys', 'raw_cpu_idle'),
            'page': ('raw_pages_in', 'raw_pages_out'),
            'swap': ('raw_swap_in', 'raw_swap_out'),
        }
        try:
            lines = this._fileLines('/proc/stat')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/stat")
            return
        for line in lines:
            fields = line.split()
            if not fields or fields[0] not in wanted:
                continue
            names = wanted[fields[0]]
            if len(fields) <= len(names):
                continue
            try:
                values = [float(text) for text in fields[1:len(names) + 1]]
            except ValueError:
                continue
            for name, value in zip(names, values):
                this.DATA[name] = value

    def readMemInfo(this):
        """Read memory and swap figures from /proc/meminfo.

        The values read from the file are divided by 1024 before being
        stored; the ``*_usage`` values are percentages.
        """
        wanted = {'MemFree:': 'mem_free', 'MemTotal:': 'total_mem',
                  'SwapFree:': 'swap_free', 'SwapTotal:': 'total_swap'}
        try:
            lines = this._fileLines('/proc/meminfo')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo")
            return
        data = this.DATA
        for line in lines:
            fields = line.split()
            if len(fields) >= 2 and fields[0] in wanted:
                try:
                    data[wanted[fields[0]]] = float(fields[1]) / 1024.0
                except ValueError:
                    pass
        if 'total_mem' in data and 'mem_free' in data:
            data['mem_used'] = data['total_mem'] - data['mem_free']
        if 'total_swap' in data and 'swap_free' in data:
            data['swap_used'] = data['total_swap'] - data['swap_free']
        if 'mem_used' in data and 'total_mem' in data and data['total_mem'] > 0:
            data['mem_usage'] = 100.0 * data['mem_used'] / data['total_mem']
        if 'swap_used' in data and 'total_swap' in data and data['total_swap'] > 0:
            data['swap_usage'] = 100.0 * data['swap_used'] / data['total_swap']

    def _storeLoadAverages(this, texts):
        """Store the first three entries of *texts* as load1/load5/load15."""
        if len(texts) < 3:
            return
        try:
            loads = [float(text) for text in texts[:3]]
        except ValueError:
            return
        this.DATA['load1'], this.DATA['load5'], this.DATA['load15'] = loads

    def readLoadAvg(this):
        """Read the 1/5/15 minute load averages from /proc/loadavg."""
        try:
            lines = this._fileLines('/proc/loadavg')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/loadavg")
            return
        if lines:
            this._storeLoadAverages(lines[0].split())

    def darwin_readLoadAvg(this):
        """Read the load averages on Darwin from ``sysctl vm.loadavg``.

        The numbers are taken from whatever follows the colon, so both
        ``vm.loadavg: 0.5 0.4 0.3`` and ``vm.loadavg: { 0.5 0.4 0.3 }`` work.
        """
        try:
            lines = this._commandOutput('sysctl vm.loadavg')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run 'sysctl vm.loadavg'")
            return
        if lines:
            values = lines[0].split(':', 1)[-1]
            this._storeLoadAverages(re.findall(r"\d+(?:\.\d+)?", values))

    def countProcesses(this):
        """Count the processes on the system, in total and per state.

        The header line printed by ps is skipped.  Processes in a state other
        than those in PROCESS_STATES are included in the total only.
        """
        try:
            lines = this._commandOutput('ps -A -o state')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from ps command")
            return
        states = {}
        for code in this.PROCESS_STATES:
            states[code] = 0
        total = 0
        for line in lines[1:]:
            code = line.strip()[:1]
            if code == '':
                continue
            total += 1
            if code in states:
                states[code] += 1
        this.DATA['processes'] = total
        for code in states:
            this.DATA['processes_' + code] = states[code]

    def readGenericInfo(this):
        """Read hostname, interface IPs, CPU description and uptime.

        The three sources are independent: a failure in one is logged and
        does not prevent the others from being read.
        """
        this.DATA['hostname'] = socket.getfqdn()
        this._readInterfaceAddresses()
        this._readCpuInfo()
        this._readUptime()

    def _readInterfaceAddresses(this):
        """Store ``ethN_ip`` for each eth interface listed by ifconfig."""
        try:
            lines = this._commandOutput('/sbin/ifconfig -a')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from /sbin/ifconfig -a")
            return
        eth = ''
        for line in lines:
            line = line.strip()
            if line.startswith("eth"):
                # newer ifconfig versions print the name as "eth0:"
                eth = line.split()[0].rstrip(':')
            if len(eth) > 0 and line.startswith("inet"):
                # accepts both "inet addr:1.2.3.4" and "inet 1.2.3.4"
                found = re.match(r"inet (?:addr:)?(\d+\.\d+\.\d+\.\d+)", line)
                if found is not None:
                    this.DATA[eth + '_ip'] = found.group(1)
                    eth = ''

    def _readCpuInfo(this):
        """Store CPU frequency, identification and count from /proc/cpuinfo.

        The number of CPUs is the number of "cpu MHz" lines in the file.
        """
        numeric = {'cpu MHz': 'cpu_MHz', 'bogomips': 'bogomips'}
        textual = {'vendor_id': 'cpu_vendor_id', 'cpu family': 'cpu_family',
                   'model': 'cpu_model', 'model name': 'cpu_model_name'}
        try:
            lines = this._fileLines('/proc/cpuinfo')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/cpuinfo")
            return
        no_cpus = 0
        for line in lines:
            parts = line.split(':', 1)
            if len(parts) != 2:
                continue
            name, value = parts[0].strip(), parts[1].strip()
            if name == 'cpu MHz':
                no_cpus += 1
            if name in numeric:
                number = re.match(r"\d+\.?\d*", value)
                if number is not None:
                    this.DATA[numeric[name]] = float(number.group(0))
            elif name in textual and value != '':
                this.DATA[textual[name]] = value
        this.DATA['no_CPUs'] = no_cpus

    def _readUptime(this):
        """Store the first number of /proc/uptime divided by 24 * 3600."""
        try:
            lines = this._fileLines('/proc/uptime')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime")
            return
        fields = []
        if lines:
            fields = lines[0].split()
        if fields:
            try:
                this.DATA['uptime'] = float(fields[0]) / (24.0 * 3600)
            except ValueError:
                pass

    def diffWithOverflowCheck(this, new, old):
        """Return new - old, repairing a wrap-around of an unsigned counter.

        When *new* is smaller than *old* the counter is assumed to have
        wrapped once; it is treated as 32 bit unless *old* does not fit in
        32 bits, in which case it is treated as 64 bit.
        """
        if new >= old:
            return new - old
        modulus = 1 << 32       # 32 bits
        if old >= modulus:
            modulus = 1 << 64   # 64 bits
        return new - old + modulus

    def readNetworkInfo(this):
        """Store raw byte and error counters per eth interface from /proc/net/dev.

        For every ``ethN`` line the first and ninth counters are stored as
        ``raw_ethN_in`` / ``raw_ethN_out`` and the sum of the third and
        eleventh as ``raw_ethN_errs``.
        """
        try:
            lines = this._fileLines('/proc/net/dev')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/net/dev")
            return
        for line in lines:
            parts = line.split(':', 1)
            if len(parts) != 2:
                continue
            iface = re.match(r"eth(\d+)$", parts[0].strip())
            counters = parts[1].split()
            if iface is None or len(counters) < 11:
                continue
            prefix = 'raw_eth' + iface.group(1)
            try:
                bytes_in = float(counters[0])
                bytes_out = float(counters[8])
                errors = int(counters[2]) + int(counters[10])
            except ValueError:
                continue
            this.DATA[prefix + '_in'] = bytes_in
            this.DATA[prefix + '_out'] = bytes_out
            this.DATA[prefix + '_errs'] = errors

    def readNetStat(this):
        """Count sockets per protocol, and tcp sockets per state, via netstat."""
        try:
            lines = this._commandOutput('netstat -an 2>/dev/null')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from netstat command")
            return
        sockets = {'sockets_tcp': 0, 'sockets_udp': 0, 'sockets_unix': 0, 'sockets_icm': 0}
        tcp_details = {}
        for state in this.TCP_STATES:
            tcp_details['sockets_tcp_' + state] = 0
        for line in lines:
            fields = line.split()
            if not fields:
                continue
            proto = fields[0]
            if proto.startswith('tcp'):
                sockets['sockets_tcp'] += 1
                # the connection state is the last column of a tcp line
                key = 'sockets_tcp_' + fields[-1]
                if key in tcp_details:
                    tcp_details[key] += 1
            if proto.startswith('udp'):
                sockets['sockets_udp'] += 1
            if proto.startswith('unix'):
                sockets['sockets_unix'] += 1
            if proto.startswith('icm'):
                sockets['sockets_icm'] += 1
        this.DATA.update(sockets)
        this.DATA.update(tcp_details)

    ########################################################################
    # job monitoring related functions
    ########################################################################

    def getChildren(this, parent):
        """Return the pids (as strings) of *parent* and all its descendants.

        *parent* goes first in the returned list.  When ps produced a process
        list that does not contain *parent*, the job is removed from the
        monitored jobs and [] is returned.  When ps produced nothing usable,
        only ``[str(parent)]`` is returned and the job is kept.
        """
        try:
            lines = this._commandOutput('ps -A -o "pid ppid"')
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps -A -o \"pid ppid\"")
            lines = []

        children_of = {}    # ppid -> list of child pids
        known = {}          # every pid that ps listed
        for line in lines[1:]:      # the first line holds the column headers
            fields = line.split()
            if len(fields) < 2:
                continue
            known[fields[0]] = 1
            children_of.setdefault(fields[1], []).append(fields[0])

        root = str(parent)
        if not known:
            return [root]
        if root not in known:
            this.logger.log(Logger.INFO, 'ProcInfo: No job with pid=' + str(parent))
            this.removeJobToMonitor(parent)
            return []

        family = [root]
        visited = {root: 1}     # guards against a pid listed as its own parent
        position = 0
        while position < len(family):
            for child in children_of.get(family[position], []):
                if child not in visited:
                    visited[child] = 1
                    family.append(child)
            position += 1
        return family

    def parsePSTime(this, my_time):
        """Convert a ps time such as "days-hours:min:sec" into seconds.

        "hours:min:sec" and "min:sec" are accepted as well; any other text
        yields 0.
        """
        my_time = my_time.strip()
        layouts = (
            (r"(\d+)-(\d+):(\d+):(\d+)", (24 * 3600, 3600, 60, 1)),
            (r"(\d+):(\d+):(\d+)", (3600, 60, 1)),
            (r"(\d+):(\d+)", (60, 1)),
        )
        for pattern, weights in layouts:
            found = re.match(pattern, my_time)
            if found is not None:
                seconds = 0
                for text, weight in zip(found.groups(), weights):
                    seconds += int(text) * weight
                return seconds
        return 0

    def readJobInfo(this, pid):
        """Aggregate ps figures of job *pid* and its children into its DATA.

        run_time is the largest elapsed time, cpu_time and cpu_usage are
        sums over all processes.  Memory figures and open files are summed
        once per distinct (%mem, rsz, vsz, command) combination, so that
        threads sharing one footprint are not counted repeatedly.
        """
        if (pid == '') or pid not in this.JOBS:
            return
        children = this.getChildren(pid)
        if len(children) == 0:
            this.logger.log(Logger.INFO, "ProcInfo: Job with pid=" + str(pid) + " terminated; removing it from monitored jobs.")
            this.removeJobToMonitor(pid)
            return
        command = ("ps --no-headers --pid " + ",".join([str(child) for child in children])
                   + " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm")
        try:
            lines = this._commandOutput(command)
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute " + command)
            return

        seen_footprints = {}
        etime, cputime, pcpu, pmem, rsz, vsz, fd = 0, 0, 0, 0, 0, 0, 0
        for line in lines:
            fields = line.strip().split(None, 7)
            if len(fields) < 8:
                continue
            apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = fields
            try:
                cpu_share, mem_share = float(pcpu1), float(pmem1)
                resident, virtual = int(rsz1), int(vsz1)
            except ValueError:
                continue
            etime = max(etime, this.parsePSTime(etime1))
            cputime += this.parsePSTime(cputime1)
            pcpu += cpu_share
            footprint = (pmem1, rsz1, vsz1, comm1)
            if footprint not in seen_footprints:
                # first thread/process with this memory footprint; add it
                seen_footprints[footprint] = 1
                pmem += mem_share
                rsz += resident
                vsz += virtual
                fd += this.countOpenFD(apid)

        data = this.JOBS[pid]['DATA']
        data['run_time'] = etime
        data['cpu_time'] = cputime
        data['cpu_usage'] = pcpu
        data['mem_usage'] = pmem
        data['rss'] = rsz
        data['virtualmem'] = vsz
        data['open_files'] = fd

    def countOpenFD(this, pid):
        """Return the number of entries in /proc/<pid>/fd, or 0 on failure."""
        fd_dir = '/proc/' + str(pid) + '/fd'
        if not os.access(fd_dir, os.F_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: job " + repr(pid) + " dosen't exist")
            return 0
        if not os.access(fd_dir, os.X_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return 0
        try:
            entries = os.listdir(fd_dir)
        except OSError:
            # the process ended, or the directory is not readable
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return 0
        open_files = len(entries)
        if str(pid) == str(os.getpid()):
            # NOTE(review): presumably discounts descriptors held by the
            # measurement itself -- confirm why the correction is 2.
            open_files -= 2
        this.logger.log(Logger.DEBUG, "Counting open_files for " + repr(pid) + ": " + str(len(entries)) + " => " + repr(open_files) + " open_files")
        return open_files

    def readJobDiskUsage(this, pid):
        """Measure the work directory size and its partition's usage for *pid*.

        Nothing is done when the job has no work directory.  The KB figures
        printed by du/df are divided by 1024; ``disk_usage`` is the
        percentage printed by df.
        """
        if (pid == '') or pid not in this.JOBS:
            return
        workDir = this.JOBS[pid]['WORKDIR']
        if workDir == '':
            return
        data = this.JOBS[pid]['DATA']
        # NOTE(review): workDir is inserted into shell commands unquoted, as
        # before; confirm that it never comes from an untrusted source.
        # NOTE(review): logger.log is called as (level, message), matching
        # every other call in this class -- confirm the Logger signature.
        try:
            lines = this._commandOutput("du -Lsck " + workDir + " | tail -1 | cut -f 1")
            if lines:
                data['workdir_size'] = int(lines[0]) / 1024.0
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run du to get job's disk usage for job " + repr(pid))
        try:
            lines = this._commandOutput("df -k " + workDir + " | tail -1")
            summary = ''
            if lines:
                summary = lines[0].strip()
            found = re.match(r"\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)%", summary)
            if found is not None:
                data['disk_total'] = float(found.group(1)) / 1024.0
                data['disk_used'] = float(found.group(2)) / 1024.0
                data['disk_free'] = float(found.group(3)) / 1024.0
                data['disk_usage'] = float(found.group(4))     # already a percentage
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run df to get job's disk usage for job " + repr(pid))

    def computeCummulativeParams(this, dataRef, prevDataRef):
        """Derive cpu_, pages_, swap_ and ethX_ values from raw counters.

        *prevDataRef* holds the raw counters and TIME of the previous sample.
        If it is empty it is only seeded from *dataRef*.  Otherwise the
        derived values are written into *dataRef* (or removed from it when
        they cannot be computed) and *prevDataRef* is brought up to date.
        """
        if 'TIME' not in dataRef:
            return      # update() has not produced a sample yet
        if not prevDataRef:
            for key in list(dataRef.keys()):
                if key.startswith('raw_'):
                    prevDataRef[key] = dataRef[key]
            prevDataRef['TIME'] = dataRef['TIME']
            return

        interval = dataRef['TIME'] - prevDataRef.get('TIME', dataRef['TIME'])

        # cpu-related params: percentages of the ticks elapsed between samples
        cpu_params = ['cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle']
        have_cpu = True
        for param in cpu_params:
            if 'raw_' + param not in dataRef or 'raw_' + param not in prevDataRef:
                have_cpu = False
        if have_cpu:
            diff = {}
            cpu_sum = 0
            for param in cpu_params:
                diff[param] = this.diffWithOverflowCheck(dataRef['raw_' + param], prevDataRef['raw_' + param])
                cpu_sum += diff[param]
            if cpu_sum != 0:
                for param in cpu_params:
                    dataRef[param] = 100.0 * diff[param] / cpu_sum
                dataRef['cpu_usage'] = 100.0 * (cpu_sum - diff['cpu_idle']) / cpu_sum
            else:
                for param in cpu_params + ['cpu_usage']:
                    dataRef.pop(param, None)

        # swap & pages -related params
        for param in ['pages_in', 'pages_out', 'swap_in', 'swap_out']:
            raw = 'raw_' + param
            if raw not in dataRef or raw not in prevDataRef:
                continue
            if interval != 0:
                delta = this.diffWithOverflowCheck(dataRef[raw], prevDataRef[raw])
                dataRef[param] = 1000.0 * delta / interval
            else:
                dataRef.pop(param, None)

        # eth-related params
        for rawParam in list(dataRef.keys()):
            if not rawParam.startswith('raw_eth') or rawParam not in prevDataRef:
                continue
            param = rawParam.split('raw_')[1]
            if interval != 0:
                delta = this.diffWithOverflowCheck(dataRef[rawParam], prevDataRef[rawParam])
                if param.find('_errs') == -1:
                    # _in and _out are reported in KB/sec, _errs as a difference
                    delta = delta / interval / 1024.0
                dataRef[param] = delta
            else:
                dataRef.pop(param, None)

        # remember the current raw values for the next call
        for param in list(dataRef.keys()):
            if param.startswith('raw_'):
                prevDataRef[param] = dataRef[param]
        prevDataRef['TIME'] = dataRef['TIME']

    def getFilteredData(this, dataHash, paramsList, prevDataHash=None):
        """Return the requested, existing values as sorted (name, value) pairs.

        Besides plain names, *paramsList* may contain:
          'net_sockets'      every 'sockets*' key except the tcp state counts
          'net_tcp_details'  every 'sockets_tcp_*' key
          'net_<x>' / 'ip'   the 'ethN_<x>' / 'ethN_ip' key of each interface
          'processes'        every key starting with 'processes'
        If *prevDataHash* is given, the cumulative values are computed first.
        """
        if prevDataHash is not None:
            this.computeCummulativeParams(dataHash, prevDataHash)

        result = {}
        for param in paramsList:
            if param == 'net_sockets':
                for key in dataHash:
                    if key.startswith('sockets') and not key.startswith('sockets_tcp_'):
                        result[key] = dataHash[key]
            elif param == 'net_tcp_details':
                for key in dataHash:
                    if key.startswith('sockets_tcp_'):
                        result[key] = dataHash[key]

            found = re.match(r"^net_(.*)$", param)
            if found is None:
                found = re.match(r"^(ip)$", param)
            if found is not None:
                per_iface = r"eth\d+_" + found.group(1)
                for key, value in dataHash.items():
                    if re.match(per_iface, key) is not None:
                        result[key] = value
            elif param == 'processes':
                for key in dataHash:
                    if key.startswith('processes'):
                        result[key] = dataHash[key]
            elif param in dataHash:
                result[param] = dataHash[param]
        return [(key, result[key]) for key in sorted(result.keys())]
587 corvo 1.1
588     ######################################################################################
589     # self test
590    
if __name__ == '__main__':
    # Self test: sample the system twice, print every group of system
    # parameters, then monitor this very process as a job.
    logger = Logger.Logger(Logger.DEBUG)
    pi = ProcInfo(logger)

    # Raw counters of the previous sample.  getSystemData() needs them to
    # turn the raw cpu/page/swap/eth counters into usable values.
    prev_data = {}

    print("first update")
    pi.update()
    pi.getSystemData([], prev_data)     # an empty dict is only seeded here
    print("Sleeping to accumulate")
    time.sleep(1)
    pi.update()
    pi.getSystemData([], prev_data)     # derives the values from both samples

    print("System Monitoring:")
    system_groups = [
        ('sys_cpu_params', ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']),
        ('sys_2_4_params', ['pages_in', 'pages_out', 'swap_in', 'swap_out']),
        ('sys_mem_params', ['mem_used', 'mem_free', 'total_mem', 'mem_usage']),
        ('sys_swap_params', ['swap_used', 'swap_free', 'total_swap', 'swap_usage']),
        ('sys_load_params', ['load1', 'load5', 'load15', 'processes', 'uptime']),
        ('sys_gen_params', ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family', 'cpu_model', 'cpu_model_name', 'bogomips']),
        ('sys_net_params', ['net_in', 'net_out', 'net_errs', 'ip']),
        ('sys_net_stat', ['sockets_tcp', 'sockets_udp', 'sockets_unix', 'sockets_icm']),
        ('sys_tcp_details', ['sockets_tcp_ESTABLISHED', 'sockets_tcp_SYN_SENT', 'sockets_tcp_SYN_RECV', 'sockets_tcp_FIN_WAIT1', 'sockets_tcp_FIN_WAIT2', 'sockets_tcp_TIME_WAIT', 'sockets_tcp_CLOSED', 'sockets_tcp_CLOSE_WAIT', 'sockets_tcp_LAST_ACK', 'sockets_tcp_LISTEN', 'sockets_tcp_CLOSING', 'sockets_tcp_UNKNOWN']),
    ]
    for label, params in system_groups:
        # A throw-away dict is passed as the previous sample so that the
        # values derived above are only filtered, not recomputed (and wiped)
        # against an interval of zero seconds.
        print("%s %s" % (label, pi.getSystemData(params, {})))

    job_pid = os.getpid()

    print("Job (myself) monitoring:")
    pi.addJobToMonitor(job_pid, os.getcwd())
    print("Sleep another second")
    time.sleep(1)
    pi.update()

    job_groups = [
        ('job_cpu_params', ['run_time', 'cpu_time', 'cpu_usage']),
        ('job_mem_params', ['mem_usage', 'rss', 'virtualmem', 'open_files']),
        ('job_disk_params', ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']),
    ]
    for label, params in job_groups:
        print("%s %s" % (label, pi.getJobData(job_pid, params)))

    pi.removeJobToMonitor(job_pid)