ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/ProcInfo.py
Revision: 1.4
Committed: Fri Mar 28 17:18:46 2008 UTC (17 years, 1 month ago) by slacapra
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_2_9_1, CRAB_2_9_1_pre2, CRAB_2_9_1_pre1, CRAB_2_9_0, CRAB_2_9_0_pre2, CRAB_2_9_0_pre1, CRAB_2_8_8, CRAB_2_8_8_pre1, CRAB_2_8_7_patch3, CRAB_2_8_7_patch2, CRAB_2_8_7_patch1, CRAB_2_8_7, CRAB_2_8_7_pre2, CRAB_2_8_7_pre1, CRAB_2_8_6, CRAB_2_8_6_pre1, CRAB_2_8_5_patch3, CRAB_2_8_5_patch2, CRAB_2_8_5_patch1, CRAB_2_8_5, CRAB_2_8_5_pre5, CRAB_2_8_5_pre4, CRAB_2_8_5_pre3, CRAB_2_8_4_patch3, CRAB_2_8_5_pre2, CRAB_2_8_4_patch2, CRAB_2_8_5_pre1, CRAB_2_8_4_patch1, CRAB_2_8_4, CRAB_2_8_4_pre5, CRAB_2_8_4_pre4, CRAB_2_8_4_pre3, CRAB_2_8_4_pre2, CRAB_2_8_4_pre1, CRAB_2_8_3, CRAB_2_8_3_pre4, CRAB_2_8_3_pre3, CRAB_2_8_3_pre2, CRAB_2_8_3_pre1, CRAB_2_8_2_patch1, CRAB_2_8_2, CRAB_2_8_2_pre5, CRAB_2_8_2_pre4, CRAB_2_8_2_pre3, CRAB_2_8_2_pre2, CRAB_2_8_2_pre1, CRAB_2_8_1, CRAB_2_8_0, CRAB_2_8_0_pre1, CRAB_2_7_10_pre3, CRAB_2_7_9_patch2_pre1, CRAB_2_7_10_pre2, CRAB_2_7_10_pre1, CRAB_2_7_9_patch1, CRAB_2_7_9, CRAB_2_7_9_pre5, CRAB_2_7_9_pre4, CRAB_2_7_9_pre3, CRAB_2_7_9_pre2, CRAB_2_7_8_patch2, CRAB_2_7_9_pre1, CRAB_2_7_8_patch2_pre1, CRAB_2_7_8_patch1, CRAB_2_7_8_patch1_pre1, CRAB_2_7_8, CRAB_2_7_8_pre3, CRAB_2_7_8_pre2, CRAB_2_7_8_dash3, CRAB_2_7_8_dash2, CRAB_2_7_8_dash, CRAB_2_7_7_patch1, CRAB_2_7_7_patch1_pre1, CRAB_2_7_8_pre1, CRAB_2_7_7, CRAB_2_7_7_pre2, CRAB_2_7_7_pre1, CRAB_2_7_6_patch1, CRAB_2_7_6, CRAB_2_7_6_pre1, CRAB_2_7_5_patch1, CRAB_2_7_5, CRAB_2_7_5_pre3, CRAB_2_7_5_pre2, CRAB_2_7_5_pre1, CRAB_2_7_4_patch1, CRAB_2_7_4, CRAB_2_7_4_pre6, CRAB_2_7_4_pre5, CRAB_2_7_4_pre4, CRAB_2_7_4_pre3, CRAB_2_7_4_pre2, CRAB_2_7_4_pre1, CRAB_2_7_3, CRAB_2_7_3_pre3, CRAB_2_7_3_pre3_beta, CRAB_2_7_3_pre2, CRAB_2_7_3_pre2_beta, CRAB_2_7_3_pre1, CRAB_2_7_3_beta3, CRAB_2_7_3_beta2, CRAB_2_7_3_beta1, CRAB_2_7_3_beta, CRAB_2_7_2_p1, CRAB_2_7_1_branch_firstMERGE, CRAB_2_7_2, CRAB_2_7_2_pre4, CRAB_2_7_2_pre3, CRAB_2_7_2_pre2, CRAB_2_7_2_pre1, CRAB_2_7_1, fede_170310, CRAB_2_7_1_pre12, CRAB_2_7_1_pre11, CRAB_2_7_1_pre10, CRAB_2_7_1_pre9, CRAB_LumiMask, CRAB_2_7_lumi, from_LimiMask, 
CRAB_2_7_1_pre8, CRAB_2_7_1_pre6, CRAB_2_7_1_pre5, CRAB_2_7_1_wmbs_pre4, CRAB_2_7_1_pre4, CRAB_2_7_1_pre3, CRAB_2_6_6_pre6, CRAB_2_7_1_pre2, CRAB_2_6_6_pre5, CRAB_2_7_1_pre1, CRAB_2_6_6_pre4, CRAB_2_6_6_pre3, CRAB_2_6_6_pre2, CRAB_2_6_6_check, CRAB_2_6_6, CRAB_2_6_6_pre1, CRAB_2_7_0, CRAB_2_6_5, CRAB_2_7_0_pre8, CRAB_2_6_5_pre1, CRAB_2_7_0_pre7, CRAB_2_6_4, CRAB_2_7_0_pre6, CRAB_2_6_4_pre1, CRAB_2_7_0_pre5, CRAB_2_6_3_patch_2, CRAB_2_6_3_patch_2_pre2, CRAB_2_6_3_patch_2_pre1, CRAB_2_6_3_patch_1, CRAB_2_7_0_pre4, CRAB_2_7_0_pre3, CRAB_2_6_3, CRAB_2_6_3_pre5, CRAB_2_6_3_pre4, CRAB_2_6_3_pre3, CRAB_2_6_3_pre2, CRAB_2_7_0_pre2, CRAB_2_6_3_pre1, test_1, CRAB_2_7_0_pre1, CRAB_2_6_2, CRAB_2_6_2_pre2, CRAB_2_6_2_pre1, CRAB_2_6_1_pre4, CRAB_2_6_1_pre3, CRAB_2_6_1_pre2, CRAB_2_6_1_pre1, CRAB_2_6_1, CRAB_2_6_0, CRAB_2_6_0_pre14, CRAB_2_6_0_pre13, CRAB_2_6_0_pre12, CRAB_2_6_0_pre11, CRAB_2_6_0_pre10, CRAB_2_6_0_pre9, CRAB_2_6_0_pre8, CRAB_2_6_0_pre7, CRAB_2_6_0_pre6, CRAB_2_6_0_pre5, CRAB_2_6_0_pre4, CRAB_2_6_0_pre3, CRAB_2_6_0_pre2, CRAB_2_6_0_pre1, CRAB_2_5_1, CRAB_2_5_1_pre4, CRAB_2_5_1_pre3, CRAB_2_5_1_pre2, CRAB_2_5_1_pre1, CRAB_2_5_0, CRAB_2_5_0_pre7, CRAB_2_5_0_pre6, CRAB_2_5_0_pre5, CRAB_2_5_0_pre4, CRAB_2_5_0_pre3, CRAB_2_5_0_pre2, CRAB_2_5_0_pre1, CRAB_2_4_4, CRAB_2_4_4_pre6, CRAB_2_4_4_pre5, CRAB_2_4_4_pre4, CRAB_2_4_4_pre3, CRAB_2_4_4_pre2, CRAB_2_4_4_pre1, CRAB_2_4_3, CRAB_2_4_3_pre8, CRAB_2_4_3_pre7, CRAB_2_4_3_pre6, CRAB_2_4_3_pre5, CRAB_2_4_3_pre3, CRAB_2_4_3_pre2, CRAB_2_4_3_pre1, CRAB_2_4_2, CRAB_2_4_2_pre3, CRAB_2_4_2_pre2, CRAB_2_4_2_pre1, CRAB_2_4_1, CRAB_2_4_1_pre4, CRAB_2_4_1_pre3, CRAB_2_4_1_pre2, CRAB_2_4_1_pre1, CRAB_2_4_0_Tutorial, CRAB_2_4_0_Tutorial_pre1, CRAB_2_4_0, CRAB_2_4_0_pre9, CRAB_2_4_0_pre8, CRAB_2_4_0_pre7, CRAB_2_4_0_pre6, CRAB_2_4_0_pre5, CRAB_2_4_0_pre4, CRAB_2_4_0_pre3, CRAB_2_4_0_pre2, CRAB_2_4_0_pre1, CRAB_DLS_PHED1, CRAB_DLS_PHED, CRAB_2_3_2_Fnal, CRAB_2_3_2, CRAB_2_3_2_pre7, CRAB_2_3_2_pre5, CRAB_2_3_2_pre4, CRAB_2_3_2_pre3, 
CRAB_2_3_2_pre2, CRAB_2_3_2_pre1, CRAB_2_4_0_test, CRAB_2_3_1, CRAB_2_3_1_pre6, CRAB_2_3_1_pre5, CRAB_2_3_1_pre4, CRAB_2_3_1_pre3, CRAB_2_3_1_pre2, CRAB_2_3_1_pre1, CRAB_2_3_0, CRAB_2_3_0_pre6, CRAB_2_3_0_pre1, CRAB_2_2_2_pre5, CRAB_2_2_2_pre4, CRAB_2_2_2_pre3, CRAB_2_2_2_pre2, CRAB_2_2_2_pre1, CRAB_2_2_1, CRAB_2_2_1_pre6, CRAB_2_2_1_pre5, CRAB_2_2_1_pre4, PRODCOMMON_0_10_7_testCS2, CRAB_2_2_1_pre3, CRAB_2_2_1_pre2, CRAB_2_2_1_pre1, CRAB_2_2_0, CRAB_2_2_0_pre21, CRAB_2_2_0_pre19, CRAB_2_2_0_pre18, CRAB_2_2_0_pre17, CRAB_2_2_0_pre16, CRAB_2_2_0_pre15, CRAB_2_2_0_pre13, CRAB_2_2_0_pre12, CRAB_2_2_0_pre11, CRAB_2_2_0_pre10, bp_osg_bdii, CRAB_2_2_0_pre9, CRAB_2_2_0_pre8, CRAB_2_2_0_pre7, CRAB_2_1_2, CRAB_2_2_0_pre5, CRAB_2_1_2_pre2, CRAB_2_1_2_pre1, CRAB_2_2_0_pre4, CRAB_2_2_0_pre2, CRAB_2_1_1, HEAD
Branch point for: CRAB_multiout, CRAB_2_7_1_branch, Lumi2_8, CRAB_2_6_X_br, AnaDataSet, CRAB_2_3_0_br, osg_bdii, CRAB_2_1_2_br
Changes since 1.3: +1 -1 lines
Log Message:
Upgrade of dashboard-related classes to improve submission of monitoring info for large tasks: see https://hypernews.cern.ch/HyperNews/CMS/get/crabDevelopment/657/1/1/1/2/1/1.html

File Contents

# Content
1
2 """
3 * ApMon - Application Monitoring Tool
4 * Version: 2.2.1
5 *
6 * Copyright (C) 2006 California Institute of Technology
7 *
8 * Permission is hereby granted, free of charge, to use, copy and modify
9 * this software and its documentation (the "Software") for any
10 * purpose, provided that existing copyright notices are retained in
11 * all copies and that this notice is included verbatim in any distributions
12 * or substantial portions of the Software.
13 * This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
14 * Users of the Software are asked to feed back problems, benefits,
15 * and/or suggestions about the software to the MonALISA Development Team
16 * (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
17 * incorporation of new features - is done on a best effort basis. All bug
18 * fixes and enhancements will be made available under the same terms and
19 * conditions as the original software,
20
21 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
22 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
23 * OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
24 * EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
27 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
29 * PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
30 * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
31 * MODIFICATIONS.
32 """
33
34
35 import os
36 import re
37 import time
38 import string
39 import socket
40 import Logger
41
42 """
43 Class ProcInfo
44 extracts information from the proc/ filesystem for system and job monitoring
45 """
class ProcInfo:
    """Extract information from the /proc filesystem for system and job monitoring.

    System-wide values are collected into ``DATA`` and per-job values into
    ``JOBS[pid]['DATA']`` each time update() is called.  The collected values
    are read back with getSystemData() and getJobData(), which return sorted
    lists of ``(name, value)`` tuples.

    The ``logger`` given to the constructor must offer ``log(level, message)``;
    levels are taken from the ``Logger`` module imported by this file.
    """

    def __init__(this, logger):
        """Create an empty monitor that reports problems through ``logger``."""
        this.DATA = {}              # monitored data that is going to be reported
        this.LAST_UPDATE_TIME = 0   # when the last measurement was done
        this.JOBS = {}              # jobs that will be monitored
        this.logger = logger        # use the given logger
        uname = os.popen('uname -s')
        try:
            this.OS_TYPE = uname.readline().replace('\n', '')
        finally:
            uname.close()

    def update(this):
        """Refresh all system and job measurements.

        This should be called from time to time, but not more often than once
        a second because of the resolution of time(); an extra call within the
        same second is logged and ignored.
        """
        if this.LAST_UPDATE_TIME == int(time.time()):
            this.logger.log(Logger.NOTICE, "ProcInfo: update() called too often!")
            return
        this.readStat()
        this.readMemInfo()
        if this.OS_TYPE == 'Darwin':
            this.darwin_readLoadAvg()
        else:
            this.readLoadAvg()
        this.countProcesses()
        this.readGenericInfo()
        this.readNetworkInfo()
        this.readNetStat()
        # Iterate over a snapshot: readJobInfo() removes finished jobs from JOBS.
        for pid in list(this.JOBS.keys()):
            this.readJobInfo(pid)
            this.readJobDiskUsage(pid)
        now = int(time.time())
        this.LAST_UPDATE_TIME = now
        this.DATA['TIME'] = now

    def addJobToMonitor(this, pid, workDir):
        """Start monitoring process ``pid``; ``workDir`` ('' for none) is its work directory."""
        this.JOBS[pid] = {'WORKDIR': workDir, 'DATA': {}}

    def removeJobToMonitor(this, pid):
        """Stop monitoring ``pid``; unknown pids are ignored."""
        if pid in this.JOBS:
            del this.JOBS[pid]

    def getSystemData(this, params, prevDataRef=None):
        """Return the requested system parameters as a sorted list of (name, value).

        ``prevDataRef`` is a caller-owned dict holding the raw counters of the
        previous query; when given, the cumulative parameters (cpu_*, pages_*,
        swap_*, ethX_*) are computed against it and it is updated in place.
        When omitted, only directly measured values are returned.
        """
        return this.getFilteredData(this.DATA, params, prevDataRef)

    def getJobData(this, pid, params):
        """Return the requested job parameters as a sorted list of (name, value).

        An empty list is returned when ``pid`` is not being monitored.
        """
        if pid not in this.JOBS:
            return []
        return this.getFilteredData(this.JOBS[pid]['DATA'], params)

    ############################################################################################
    # internal functions for system monitoring
    ############################################################################################

    def readStat(this):
        """Read the raw cpu / paging / swapping counters from /proc/stat.

        Two samples are needed to get useful rates (see
        computeCummulativeParams).  The page and swap lines are only stored
        when /proc/stat contains them.
        """
        try:
            FSTAT = open('/proc/stat')
            try:
                for line in FSTAT:
                    elem = line.split()
                    if line.startswith("cpu "):
                        this.DATA['raw_cpu_usr'] = float(elem[1])
                        this.DATA['raw_cpu_nice'] = float(elem[2])
                        this.DATA['raw_cpu_sys'] = float(elem[3])
                        this.DATA['raw_cpu_idle'] = float(elem[4])
                    if line.startswith("page"):
                        this.DATA['raw_pages_in'] = float(elem[1])
                        this.DATA['raw_pages_out'] = float(elem[2])
                    if line.startswith('swap'):
                        this.DATA['raw_swap_in'] = float(elem[1])
                        this.DATA['raw_swap_out'] = float(elem[2])
            finally:
                FSTAT.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/stat")

    def readMemInfo(this):
        """Read memory and swap figures from /proc/meminfo.

        The values read are divided by 1024 before being stored; the *_usage
        entries are percentages.
        """
        wanted = {
            "MemFree:": 'mem_free',
            "MemTotal:": 'total_mem',
            "SwapFree:": 'swap_free',
            "SwapTotal:": 'total_swap',
        }
        data = this.DATA
        try:
            FMEM = open('/proc/meminfo')
            try:
                for line in FMEM:
                    elem = line.split()
                    if len(elem) >= 2 and elem[0] in wanted:
                        data[wanted[elem[0]]] = float(elem[1]) / 1024.0
            finally:
                FMEM.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/meminfo")
            return
        if 'total_mem' in data and 'mem_free' in data:
            data['mem_used'] = data['total_mem'] - data['mem_free']
        if 'total_swap' in data and 'swap_free' in data:
            data['swap_used'] = data['total_swap'] - data['swap_free']
        if 'mem_used' in data and 'total_mem' in data and data['total_mem'] > 0:
            data['mem_usage'] = 100.0 * data['mem_used'] / data['total_mem']
        if 'swap_used' in data and 'total_swap' in data and data['total_swap'] > 0:
            data['swap_usage'] = 100.0 * data['swap_used'] / data['total_swap']

    def readLoadAvg(this):
        """Read the 1, 5 and 15 minute load averages from /proc/loadavg."""
        try:
            FAVG = open('/proc/loadavg')
            try:
                line = FAVG.readline()
            finally:
                FAVG.close()
            elem = line.split()
            this.DATA['load1'] = float(elem[0])
            this.DATA['load5'] = float(elem[1])
            this.DATA['load15'] = float(elem[2])
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/loadavg")

    def darwin_readLoadAvg(this):
        """Read the load averages on Darwin from ``sysctl vm.loadavg``.

        Both the plain ("vm.loadavg: 0.5 0.4 0.3") and the brace-wrapped
        ("vm.loadavg: { 0.5 0.4 0.3 }") output formats are understood.
        """
        try:
            LOAD_AVG = os.popen('sysctl vm.loadavg')
            try:
                line = LOAD_AVG.readline()
            finally:
                LOAD_AVG.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run 'sysctl vm.loadavg'")
            return
        # Only look after the "name:" part and pick the numbers, whatever
        # decoration (braces, decimal comma) surrounds them.
        values = re.findall(r"\d+(?:[.,]\d+)?", line.split(":", 1)[-1])
        if len(values) < 3:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run 'sysctl vm.loadavg'")
            return
        loads = [float(value.replace(',', '.')) for value in values[:3]]
        this.DATA['load1'], this.DATA['load5'], this.DATA['load15'] = loads

    def countProcesses(this):
        """Count the processes on the system, in total and per state letter.

        Stores ``processes`` and one ``processes_<state>`` entry per state
        seen (D, R, S, T and Z are always present).
        """
        total = 0
        states = {'D': 0, 'R': 0, 'S': 0, 'T': 0, 'Z': 0}
        try:
            output = os.popen('ps -A -o state')
            try:
                lines = output.readlines()
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from ps command")
            return
        # The first line is the column header, not a process.
        for line in lines[1:]:
            line = line.strip()
            if line == '':
                continue
            # ps may report states other than the five pre-seeded ones.
            states[line[0]] = states.get(line[0], 0) + 1
            total += 1
        this.DATA['processes'] = total
        for key in states:
            this.DATA['processes_' + key] = states[key]

    def readGenericInfo(this):
        """Read hostname, interface IPs, cpu description and uptime.

        The three sources are independent: a failure reading one of them is
        logged and does not prevent the others from being read.
        """
        this.DATA['hostname'] = socket.getfqdn()
        this._readInterfaceAddresses()
        this._readCpuInfo()
        this._readUptime()

    def _matchGroup(this, pattern, line):
        """Return group 1 of ``pattern`` matched at the start of ``line``, or None."""
        m = re.match(pattern, line)
        if m is None:
            return None
        return m.group(1)

    def _readInterfaceAddresses(this):
        """Store ``ethN_ip`` for every eth interface listed by ifconfig."""
        try:
            output = os.popen('/sbin/ifconfig -a')
            try:
                eth = ''
                for line in output:
                    line = line.strip()
                    if line.startswith("eth"):
                        # newer ifconfig prints "eth0:" - drop the colon
                        eth = line.split()[0].rstrip(':')
                    if len(eth) > 0 and line.startswith("inet "):
                        # "inet addr:1.2.3.4" (old) or "inet 1.2.3.4" (new)
                        ip = this._matchGroup(r"inet (?:addr:)?\s*(\d+\.\d+\.\d+\.\d+)", line)
                        if ip is not None:
                            this.DATA[eth + '_ip'] = ip
                        eth = ''
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from /sbin/ifconfig -a")

    def _readCpuInfo(this):
        """Store the cpu description found in /proc/cpuinfo and the cpu count."""
        text_fields = [
            ('cpu_vendor_id', r"vendor_id\s+:\s+(.+)"),
            ('cpu_family', r"cpu family\s+:\s+(.+)"),
            ('cpu_model', r"model\s+:\s+(.+)"),
            ('cpu_model_name', r"model name\s+:\s+(.+)"),
        ]
        number_fields = [
            ('cpu_MHz', r"cpu MHz\s+:\s+(\d+\.?\d*)"),
            ('bogomips', r"bogomips\s+:\s+(\d+\.?\d*)"),
        ]
        try:
            no_cpus = 0
            FCPU = open('/proc/cpuinfo')
            try:
                for line in FCPU:
                    # one "cpu MHz" line is expected per logical cpu
                    if line.startswith("cpu MHz"):
                        no_cpus += 1
                    for key, pattern in number_fields:
                        value = this._matchGroup(pattern, line)
                        if value is not None:
                            this.DATA[key] = float(value)
                    for key, pattern in text_fields:
                        value = this._matchGroup(pattern, line)
                        if value is not None:
                            this.DATA[key] = value
            finally:
                FCPU.close()
            this.DATA['no_CPUs'] = no_cpus
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/cpuinfo")

    def _readUptime(this):
        """Store the first value of /proc/uptime divided by 24 * 3600."""
        try:
            FUPT = open('/proc/uptime')
            try:
                line = FUPT.readline()
            finally:
                FUPT.close()
            elem = line.split()
            this.DATA['uptime'] = float(elem[0]) / (24.0 * 3600)
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/uptime")

    def diffWithOverflowCheck(this, new, old):
        """Return ``new - old``, repairing a wrap-around of the counter.

        When ``new`` is smaller than ``old`` the counter is taken to have
        wrapped; it is treated as unsigned 32 bit unless ``old`` is already
        too large for that, in which case 64 bit is assumed.
        """
        if new >= old:
            return new - old
        limit = 2 ** 32         # 32 bits
        if old >= limit:
            limit = 2 ** 64     # 64 bits
        return new - old + limit

    def readNetworkInfo(this):
        """Read the raw byte and error counters of every ethN interface.

        From each ethN line of /proc/net/dev the 1st and 9th numeric columns
        are stored as raw_ethN_in / raw_ethN_out, and the sum of the 3rd and
        11th as raw_ethN_errs.
        """
        # The kernel pads the first counter, so "eth0:  1234" and
        # "eth0:123456789" must both be accepted.
        pattern = re.compile(
            r"\s*eth(\d+):\s*(\d+)\s+\d+\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+\d+\s+(\d+)")
        try:
            FNET = open('/proc/net/dev')
            try:
                for line in FNET:
                    m = pattern.match(line)
                    if m is not None:
                        prefix = 'raw_eth' + m.group(1)
                        this.DATA[prefix + '_in'] = float(m.group(2))
                        this.DATA[prefix + '_out'] = float(m.group(4))
                        this.DATA[prefix + '_errs'] = int(m.group(3)) + int(m.group(5))
            finally:
                FNET.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot open /proc/net/dev")

    def readNetStat(this):
        """Run netstat and count sockets per protocol and tcp sockets per state."""
        sockets = {'sockets_tcp': 0, 'sockets_udp': 0, 'sockets_unix': 0, 'sockets_icm': 0}
        tcp_details = {
            'sockets_tcp_ESTABLISHED': 0, 'sockets_tcp_SYN_SENT': 0,
            'sockets_tcp_SYN_RECV': 0, 'sockets_tcp_FIN_WAIT1': 0, 'sockets_tcp_FIN_WAIT2': 0,
            'sockets_tcp_TIME_WAIT': 0, 'sockets_tcp_CLOSED': 0, 'sockets_tcp_CLOSE_WAIT': 0,
            'sockets_tcp_LAST_ACK': 0, 'sockets_tcp_LISTEN': 0, 'sockets_tcp_CLOSING': 0,
            'sockets_tcp_UNKNOWN': 0,
        }
        try:
            output = os.popen('netstat -an 2>/dev/null')
            try:
                for line in output:
                    arg = line.split()
                    if len(arg) == 0:
                        continue    # blank separator line
                    proto = arg[0]
                    if proto.find('tcp') == 0:
                        sockets['sockets_tcp'] += 1
                        # the connection state is the last column
                        key = 'sockets_tcp_' + arg[-1]
                        if key in tcp_details:
                            tcp_details[key] += 1
                    if proto.find('udp') == 0:
                        sockets['sockets_udp'] += 1
                    if proto.find('unix') == 0:
                        sockets['sockets_unix'] += 1
                    if proto.find('icm') == 0:
                        sockets['sockets_icm'] += 1
            finally:
                output.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot get output from netstat command")
            return
        this.DATA.update(sockets)
        this.DATA.update(tcp_details)

    ##############################################################################################
    # job monitoring related functions
    ##############################################################################################

    def getChildren(this, parent):
        """Return ``parent`` followed by the pids of all its descendants.

        Descendant pids are strings, as printed by ps.  When ps lists
        processes but ``parent`` is not among them, the job is removed from
        the monitored jobs and an empty list is returned.  When ps yields
        nothing at all, nothing can be concluded and ``[parent]`` is returned.
        """
        pidmap = {}     # pid -> parent pid, both as strings
        try:
            output = os.popen('ps -A -o "pid ppid"')
            try:
                lines = output.readlines()
            finally:
                output.close()
            for line in lines[1:]:      # skip headers
                elem = line.split()
                if len(elem) >= 2:
                    pidmap[elem[0]] = elem[1]
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps -A -o \"pid ppid\"")

        if not pidmap:
            return [parent]

        children = this._collectFamily(parent, pidmap)
        if len(children) == 0:
            this.logger.log(Logger.INFO, 'ProcInfo: No job with pid=' + str(parent))
            this.removeJobToMonitor(parent)
        return children

    def _collectFamily(this, parent, pidmap):
        """Return [parent, descendants...] from a pid -> ppid map, or [] if parent is absent.

        ``pidmap`` holds pids as strings; ``parent`` may be an int or a string.
        """
        if str(parent) not in pidmap:
            return []
        by_parent = {}
        for pid, ppid in pidmap.items():
            by_parent.setdefault(ppid, []).append(pid)
        family = [parent]
        seen = {str(parent): True}  # guards against a self-parented pid looping forever
        i = 0
        while i < len(family):
            for pid in by_parent.get(str(family[i]), []):
                if pid not in seen:
                    seen[pid] = True
                    family.append(pid)
            i += 1
        return family

    def parsePSTime(this, my_time):
        """Convert a ps time ("days-hours:min:sec", "hours:min:sec" or "min:sec") to seconds.

        Returns 0 when the text matches none of these formats.
        """
        my_time = my_time.strip()
        m = re.match(r"(\d+)-(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            days, hours, mins, secs = [int(part) for part in m.groups()]
            return days * 24 * 3600 + hours * 3600 + mins * 60 + secs
        m = re.match(r"(\d+):(\d+):(\d+)", my_time)
        if m is not None:
            hours, mins, secs = [int(part) for part in m.groups()]
            return hours * 3600 + mins * 60 + secs
        m = re.match(r"(\d+):(\d+)", my_time)
        if m is not None:
            return int(m.group(1)) * 60 + int(m.group(2))
        return 0

    def readJobInfo(this, pid):
        """Collect run time, cpu and memory usage and open files for job ``pid``.

        The figures cover the process and all of its descendants.  A job whose
        process is gone is removed from the monitored jobs.
        """
        if (pid == '') or pid not in this.JOBS:
            return
        children = this.getChildren(pid)
        if len(children) == 0:
            this.logger.log(Logger.INFO, "ProcInfo: Job with pid=" + str(pid) + " terminated; removing it from monitored jobs.")
            this.removeJobToMonitor(pid)
            return
        pid_list = ",".join([str(child) for child in children])
        try:
            JSTATUS = os.popen("ps --no-headers --pid " + pid_list + " -o pid,etime,time,%cpu,%mem,rsz,vsz,comm")
            try:
                lines = JSTATUS.readlines()
            finally:
                JSTATUS.close()
        except IOError:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot execute ps to get the job information for job " + repr(pid))
            return

        seen_footprints = {}
        etime, cputime, pcpu, pmem, rsz, vsz, fd = 0, 0, 0, 0, 0, 0, 0
        for line in lines:
            m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)", line.strip())
            if m is None:
                continue
            apid, etime1, cputime1, pcpu1, pmem1, rsz1, vsz1, comm1 = m.groups()
            # the elapsed time is the maximum over all the processes
            etime = max(etime, this.parsePSTime(etime1))
            # cpu time and %cpu are summed over all the processes
            cputime += this.parsePSTime(cputime1)
            pcpu += float(pcpu1)
            # Processes with an identical memory footprint and command are
            # counted only once for memory and open files.
            footprint = (pmem1, rsz1, vsz1, comm1)
            if footprint not in seen_footprints:
                seen_footprints[footprint] = 1
                pmem += float(pmem1)
                rsz += int(rsz1)
                vsz += int(vsz1)
                fd += this.countOpenFD(apid)

        job_data = this.JOBS[pid]['DATA']
        job_data['run_time'] = etime
        job_data['cpu_time'] = cputime
        job_data['cpu_usage'] = pcpu
        job_data['mem_usage'] = pmem
        job_data['rss'] = rsz
        job_data['virtualmem'] = vsz
        job_data['open_files'] = fd

    def countOpenFD(this, pid):
        """Return the number of entries in /proc/<pid>/fd, or 0 when it cannot be read.

        For the current process two entries are subtracted.
        """
        fd_dir = '/proc/' + str(pid) + '/fd'
        if not os.access(fd_dir, os.F_OK):
            this.logger.log(Logger.ERROR, "ProcInfo: job " + repr(pid) + " dosen't exist")
            return 0
        entries = None
        if os.access(fd_dir, os.X_OK):
            try:
                entries = os.listdir(fd_dir)
            except OSError:
                entries = None      # the process vanished between the check and the listing
        if entries is None:
            this.logger.log(Logger.ERROR, "ProcInfo: cannot count the number of opened files for job " + repr(pid))
            return 0
        open_files = len(entries)
        # pid may be an int or the string printed by ps
        if str(pid) == str(os.getpid()):
            open_files -= 2
        this.logger.log(Logger.DEBUG, "Counting open_files for " + repr(pid) + ": " + str(len(entries)) + " => " + repr(open_files) + " open_files")
        return open_files

    def readJobDiskUsage(this, pid):
        """Measure the work directory size and the state of its partition for job ``pid``.

        Nothing is done when the job has no work directory.  Sizes as reported
        by ``du -k`` / ``df -k`` are divided by 1024; ``disk_usage`` is the
        use percentage reported by df.
        """
        if (pid == '') or pid not in this.JOBS:
            return
        workDir = this.JOBS[pid]['WORKDIR']
        if workDir == '':
            return
        job_data = this.JOBS[pid]['DATA']
        # NOTE(review): workDir is inserted into the shell command unquoted.
        try:
            DU = os.popen("du -Lsck " + workDir + " | tail -1 | cut -f 1")
            try:
                line = DU.readline()
            finally:
                DU.close()
            job_data['workdir_size'] = int(line) / 1024.0
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run du to get job's disk usage for job " + repr(pid))
        try:
            # -P keeps each filesystem on a single line even for long device names
            DF = os.popen("df -kP " + workDir + " | tail -1")
            try:
                line = DF.readline().strip()
            finally:
                DF.close()
            m = re.match(r"\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)%", line)
            if m is not None:
                job_data['disk_total'] = float(m.group(1)) / 1024.0
                job_data['disk_used'] = float(m.group(2)) / 1024.0
                job_data['disk_free'] = float(m.group(3)) / 1024.0
                job_data['disk_usage'] = float(m.group(4))      # already a percentage
        except (IOError, ValueError):
            this.logger.log(Logger.ERROR, "ProcInfo: cannot run df to get job's disk usage for job " + repr(pid))

    def _hasRawValues(this, params, dataRef, prevDataRef):
        """Tell whether 'raw_<param>' is present in both hashes for every param."""
        for param in params:
            if 'raw_' + param not in dataRef or 'raw_' + param not in prevDataRef:
                return False
        return True

    def _rememberRawValues(this, dataRef, prevDataRef):
        """Copy every raw_* value and TIME from ``dataRef`` into ``prevDataRef``."""
        for key in list(dataRef.keys()):
            if key.find('raw_') == 0:
                prevDataRef[key] = dataRef[key]
        prevDataRef['TIME'] = dataRef['TIME']

    def computeCummulativeParams(this, dataRef, prevDataRef):
        """Derive cpu_, pages_, swap_ and ethX_ values from the raw counters.

        ``dataRef`` holds the current sample and receives the derived values;
        ``prevDataRef`` holds the raw counters of the previous sample and is
        updated to the current ones.  An empty ``prevDataRef`` is only
        initialised.  A derived value that cannot be computed (no change in
        the cpu counters, or no time elapsed) is removed from ``dataRef``.
        """
        if 'TIME' not in dataRef:
            return      # nothing has been measured yet
        if prevDataRef == {}:
            this._rememberRawValues(dataRef, prevDataRef)
            return
        interval = dataRef['TIME'] - prevDataRef.get('TIME', dataRef['TIME'])

        # cpu -related params
        cpu_params = ['cpu_usr', 'cpu_nice', 'cpu_sys', 'cpu_idle']
        if this._hasRawValues(cpu_params, dataRef, prevDataRef):
            diff = {}
            cpu_sum = 0
            for param in cpu_params:
                diff[param] = this.diffWithOverflowCheck(dataRef['raw_' + param], prevDataRef['raw_' + param])
                cpu_sum += diff[param]
            if cpu_sum != 0:
                for param in cpu_params:
                    dataRef[param] = 100.0 * diff[param] / cpu_sum
                dataRef['cpu_usage'] = 100.0 * (cpu_sum - diff['cpu_idle']) / cpu_sum
            else:
                for param in cpu_params + ['cpu_usage']:
                    dataRef.pop(param, None)

        # swap & pages -related params
        page_params = ['pages_in', 'pages_out', 'swap_in', 'swap_out']
        if this._hasRawValues(page_params, dataRef, prevDataRef):
            for param in page_params:
                if interval != 0:
                    diff = this.diffWithOverflowCheck(dataRef['raw_' + param], prevDataRef['raw_' + param])
                    dataRef[param] = 1000.0 * diff / interval
                else:
                    dataRef.pop(param, None)

        # eth - related params (snapshot the keys: the loop adds entries)
        for rawParam in list(dataRef.keys()):
            if rawParam.find('raw_eth') == 0 and rawParam in prevDataRef:
                param = rawParam[len('raw_'):]
                if interval != 0:
                    # absolute difference
                    value = this.diffWithOverflowCheck(dataRef[rawParam], prevDataRef[rawParam])
                    if param.find('_errs') == -1:
                        # if it's _in or _out, compute in KB/sec
                        value = value / interval / 1024.0
                    dataRef[param] = value
                else:
                    dataRef.pop(param, None)

        # the current raw values become the previous ones for the next call
        this._rememberRawValues(dataRef, prevDataRef)

    def getFilteredData(this, dataHash, paramsList, prevDataHash=None):
        """Return the requested parameters that exist in ``dataHash``.

        The result is a list of (param, value) tuples sorted by name.  Some
        requests expand to several entries: 'net_sockets', 'net_tcp_details',
        'processes', 'ip' and 'net_<x>' (every ethN_<x> entry).  When
        ``prevDataHash`` is given, the cumulative parameters are computed
        first (see computeCummulativeParams).
        """
        if prevDataHash is not None:
            this.computeCummulativeParams(dataHash, prevDataHash)

        result = {}
        for param in paramsList:
            if param == 'net_sockets':
                # per-protocol totals only, not the per-state tcp counters
                for key in dataHash:
                    if key.find('sockets') == 0 and key.find('sockets_tcp_') == -1:
                        result[key] = dataHash[key]
            elif param == 'net_tcp_details':
                for key in dataHash:
                    if key.find('sockets_tcp_') == 0:
                        result[key] = dataHash[key]

            m = re.match("^net_(.*)$", param)
            if m is None:
                m = re.match("^(ip)$", param)
            if m is not None:
                pattern = r"eth\d+_" + m.group(1)
                for key, value in dataHash.items():
                    if re.match(pattern, key) is not None:
                        result[key] = value
            elif param == 'processes':
                for key in dataHash:
                    if key.find('processes') == 0:
                        result[key] = dataHash[key]
            elif param in dataHash:
                result[param] = dataHash[param]

        sorted_result = list(result.items())
        sorted_result.sort()    # keys are unique, so this orders by name
        return sorted_result
587
588 ######################################################################################
589 # self test
590
if __name__ == '__main__':
    # Self test: take two system samples one second apart, print every group
    # of system parameters, then monitor this very process as a job.
    logger = Logger.Logger(Logger.DEBUG)
    pi = ProcInfo(logger)

    print("first update")
    pi.update()
    # Keep the raw counters of the first sample: the cumulative values
    # (cpu_*, net_*) are differences between two samples, so getSystemData()
    # needs a previous-sample hash to compute them.
    first_sample = {}
    pi.getSystemData([], first_sample)
    print("Sleeping to accumulate")
    time.sleep(1)
    pi.update()

    print("System Monitoring:")
    system_groups = [
        ('sys_cpu_params', ['cpu_usr', 'cpu_sys', 'cpu_idle', 'cpu_nice', 'cpu_usage']),
        ('sys_2_4_params', ['pages_in', 'pages_out', 'swap_in', 'swap_out']),
        ('sys_mem_params', ['mem_used', 'mem_free', 'total_mem', 'mem_usage']),
        ('sys_swap_params', ['swap_used', 'swap_free', 'total_swap', 'swap_usage']),
        ('sys_load_params', ['load1', 'load5', 'load15', 'processes', 'uptime']),
        ('sys_gen_params', ['hostname', 'cpu_MHz', 'no_CPUs', 'cpu_vendor_id', 'cpu_family', 'cpu_model', 'cpu_model_name', 'bogomips']),
        ('sys_net_params', ['net_in', 'net_out', 'net_errs', 'ip']),
        ('sys_net_stat', ['sockets_tcp', 'sockets_udp', 'sockets_unix', 'sockets_icm']),
        ('sys_tcp_details', ['sockets_tcp_ESTABLISHED', 'sockets_tcp_SYN_SENT', 'sockets_tcp_SYN_RECV', 'sockets_tcp_FIN_WAIT1', 'sockets_tcp_FIN_WAIT2', 'sockets_tcp_TIME_WAIT', 'sockets_tcp_CLOSED', 'sockets_tcp_CLOSE_WAIT', 'sockets_tcp_LAST_ACK', 'sockets_tcp_LISTEN', 'sockets_tcp_CLOSING', 'sockets_tcp_UNKNOWN']),
    ]
    for label, params in system_groups:
        # Each query gets its own copy of the first sample, because the call
        # overwrites the hash it is given with the current counters.
        print("%s %s" % (label, pi.getSystemData(params, dict(first_sample))))

    job_pid = os.getpid()

    print("Job (mysefl) monitoring:")
    pi.addJobToMonitor(job_pid, os.getcwd())
    print("Sleep another second")
    time.sleep(1)
    pi.update()

    job_groups = [
        ('job_cpu_params', ['run_time', 'cpu_time', 'cpu_usage']),
        ('job_mem_params', ['mem_usage', 'rss', 'virtualmem', 'open_files']),
        ('job_disk_params', ['workdir_size', 'disk_used', 'disk_free', 'disk_total', 'disk_usage']),
    ]
    # The job data was collected by the update() above, so it is printed
    # right away.
    for label, params in job_groups:
        print("%s %s" % (label, pi.getJobData(job_pid, params)))

    pi.removeJobToMonitor(os.getpid())