1 |
corvo |
1.1 |
|
2 |
|
|
"""
|
3 |
|
|
* ApMon - Application Monitoring Tool
|
4 |
|
|
* Version: 2.0.4
|
5 |
|
|
*
|
6 |
|
|
* Copyright (C) 2005 California Institute of Technology
|
7 |
|
|
*
|
8 |
|
|
* Permission is hereby granted, free of charge, to use, copy and modify
|
9 |
|
|
* this software and its documentation (the "Software") for any
|
10 |
|
|
* purpose, provided that existing copyright notices are retained in
|
11 |
|
|
* all copies and that this notice is included verbatim in any distributions
|
12 |
|
|
* or substantial portions of the Software.
|
13 |
|
|
* This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
|
14 |
|
|
* Users of the Software are asked to feed back problems, benefits,
|
15 |
|
|
* and/or suggestions about the software to the MonALISA Development Team
|
16 |
|
|
* (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
|
17 |
|
|
* incorporation of new features - is done on a best effort basis. All bug
|
18 |
|
|
* fixes and enhancements will be made available under the same terms and
|
19 |
|
|
* conditions as the original software,
|
20 |
|
|
|
21 |
|
|
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
|
22 |
|
|
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
|
23 |
|
|
* OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
|
24 |
|
|
* EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25 |
|
|
|
26 |
|
|
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
27 |
|
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
28 |
|
|
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
|
29 |
|
|
* PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
|
30 |
|
|
* OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
31 |
|
|
* MODIFICATIONS.
|
32 |
|
|
"""
|
33 |
|
|
|
34 |
|
|
"""
|
35 |
|
|
apmon.py
|
36 |
|
|
|
37 |
|
|
This is a python implementation for the ApMon API for sending
|
38 |
|
|
data to the MonALISA service.
|
39 |
|
|
|
40 |
|
|
For further details about ApMon please see the C/C++ or Java documentation
|
41 |
|
|
You can find a sample usage of this module in apmTest.py.
|
42 |
|
|
|
43 |
|
|
Note that the parameters must be either integers(32 bits) or doubles(64 bits).
|
44 |
|
|
Sending strings is supported, but they will not be stored in the
|
45 |
|
|
farm's store nor shown in the farm's window in the MonALISA client.
|
46 |
|
|
"""
|
47 |
|
|
|
48 |
|
|
import re
|
49 |
|
|
import xdrlib
|
50 |
|
|
import socket
|
51 |
|
|
import urllib2
|
52 |
|
|
import threading
|
53 |
|
|
import time
|
54 |
|
|
import Logger
|
55 |
|
|
import ProcInfo
|
56 |
|
|
|
57 |
|
|
#__all__ = ["ApMon"]
|
58 |
|
|
|
59 |
|
|
#__debug = False # set this to True to be verbose
|
60 |
|
|
|
61 |
|
|
class ApMon:
    """
    Main class for sending monitoring data to a MonALISA module.
    One or more destinations can be chosen for the data. See constructor.

    The data is packed in UDP datagrams, using XDR. The following fields are sent:
    - version & password (string)
    - cluster name (string)
    - node name (string)
    - number of parameters (int)
    - for each parameter:
        - name (string)
        - value type (int)
        - value
    - optionally a (int) with the given timestamp

    Attributes (public):
    - destinations - a hash keyed by (ip, port, password) tuples; each value is
      the hash of background-monitoring options used for that destination
    - configAddresses - list with files and urls from where the config is read
    - configRecheckInterval - period, in seconds, to check for changes
      in the configAddresses list
    - configRecheck - boolean - whether to recheck periodically for changes
      in the configAddresses list
    - performBgMonitoring - boolean - whether the background thread sends
      system/job monitoring data
    - monitoredJobs - hash of the jobs being monitored, keyed by pid
    """
    destinations = {}  # empty, by default; key = tuple (host, port, pass) ; val = hash {"param_name" : True/False, ...}
    configAddresses = []  # empty, by default; list of files/urls from where we read config
    configRecheckInterval = 120  # 2 minutes
    configRecheck = True  # enabled by default
    performBgMonitoring = True  # by default, perform background monitoring
    monitoredJobs = {}  # Monitored jobs; key = pid; value = hash with 'CLUSTER_NAME' and 'NODE_NAME' entries

    # Default background-monitoring options. Keys prefixed with "sys_" / "job_"
    # select system / job parameters; the remaining keys describe general host
    # information. The *_monitoring, *_interval and *_data_sent entries are
    # control values, not parameter names.
    __defaultOptions = {
        'job_monitoring': True,  # perform (or not) job monitoring
        'job_interval' : 10,  # at this interval (in seconds)
        'job_data_sent' : 0,  # time from Epoch when job information was sent; don't touch!

        # NOTE(review): the original comments on the next two entries looked
        # swapped (cpu_time was described as elapsed time and run_time as
        # processor time); the descriptions below assume that - TODO confirm
        # against ProcInfo.
        'job_cpu_time' : True,  # presumably processor time spent running this job, in seconds
        'job_run_time' : True,  # presumably elapsed time from the start of this job, in seconds
        'job_cpu_usage' : True,  # current percent of the processor used for this job, as reported by ps
        'job_virtualmem': True,  # size of the virtual memory occupied by the job, as reported by ps (original comment said "JB"; presumably KB)
        'job_rss' : True,  # size in KB of the resident image size of the job, as reported by ps
        'job_mem_usage' : True,  # percent of the memory occupied by the job, as reported by ps
        'job_workdir_size': True,  # size in MB of the working directory of the job
        'job_disk_total': True,  # size in MB of the total size of the disk partition containing the working directory
        'job_disk_used' : True,  # size in MB of the used disk partition containing the working directory
        'job_disk_free' : True,  # size in MB of the free disk partition containing the working directory
        'job_disk_usage': True,  # percent of the used disk partition containing the working directory

        'sys_monitoring': True,  # perform (or not) system monitoring
        'sys_interval' : 10,  # at this interval (in seconds)
        'sys_data_sent' : 0,  # time from Epoch when system information was sent; don't touch!

        'sys_cpu_usr' : False,  # cpu-usage information
        'sys_cpu_sys' : False,  # all these will produce corresponding params without "sys_"
        'sys_cpu_nice' : False,
        'sys_cpu_idle' : False,
        'sys_cpu_usage' : True,
        'sys_load1' : True,  # system load information
        'sys_load5' : True,
        'sys_load15' : True,
        'sys_mem_used' : False,  # memory usage information
        'sys_mem_free' : False,
        'sys_mem_usage' : True,
        'sys_pages_in' : False,
        'sys_pages_out' : False,
        'sys_swap_used' : True,  # swap usage information
        'sys_swap_free' : False,
        'sys_swap_usage': True,
        'sys_swap_in' : False,
        'sys_swap_out' : False,
        'sys_net_in' : True,  # network transfer in kBps
        'sys_net_out' : True,  # these will produce params called ethX_in, ethX_out, ethX_errs
        'sys_net_errs' : False,  # for each eth interface
        'sys_processes' : True,
        'sys_uptime' : True,  # uptime of the machine, in days (float number)

        'general_info' : True,  # send (or not) general host information once every 2 $sys_interval seconds
        'general_data_sent': 0,  # time from Epoch when general information was sent; don't touch!

        'hostname' : True,
        'ip' : True,  # will produce ethX_ip params for each interface
        'cpu_MHz' : True,
        'no_CPUs' : True,  # number of CPUs
        'total_mem' : True,
        'total_swap' : True};
|
146 |
|
|
|
147 |
|
|
def __init__ (self, initValue):
    """
    Class constructor.

    - if initValue is a string, put it in configAddresses and load destinations
      from the file named like that. If it starts with "http://", the configuration
      is loaded from that URL. For background monitoring, given parameters will
      overwrite defaults.

    - if initValue is a list, use it as configAddresses and create the list of
      destinations from all those sources. For background monitoring, given
      parameters will overwrite defaults (see __defaultOptions).

    - if initValue is a tuple (of strings), initialize destinations with those values.
      Strings in this tuple have this form: "{hostname|ip}[:port][ passwd]", the
      default port being 8884 and the default password being "". Background
      monitoring will be enabled, sending the parameters active in __defaultOptions.

    - if initValue is a hash (key = string(hostname|ip[:port][ passwd]),
      val = hash{'param_name': True/False, ...}) the given options for each
      destination will overwrite the default parameters (see __defaultOptions).

    If no destination could be defined, an error is logged and the instance
    stays uninitialized: no socket is opened and no thread is started.
    """
    # The class-level containers are shared by every instance; give this
    # instance its own so that two ApMon objects do not leak destinations,
    # config sources or monitored jobs into each other.
    self.destinations = {}
    self.configAddresses = []
    self.monitoredJobs = {}

    self.logger = Logger.Logger(self.__defaultLogLevel)
    if isinstance(initValue, str):
        self.configAddresses.append(initValue)
        self.__reloadAddresses()
    elif isinstance(initValue, list):
        self.configAddresses = initValue
        self.__reloadAddresses()
    elif isinstance(initValue, tuple):
        for dest in initValue:
            self.__addDestination(dest, self.destinations)
    elif isinstance(initValue, dict):
        for dest, opts in initValue.items():
            self.__addDestination(dest, self.destinations, opts)

    self.__initializedOK = (len(self.destinations) > 0)
    if not self.__initializedOK:
        self.logger.log(Logger.ERROR, "Failed to initialize. No destination defined.")
        return

    self.__udpSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    if len(self.configAddresses) > 0:
        # there are config sources that need to be watched:
        # start the config checking and reloading thread
        th = threading.Thread(target=self.__configLoader)
        th.setDaemon(True)  # daemon thread: must not keep the process alive
        th.start()
    # create the ProcInfo instance used by background monitoring
    self.procInfo = ProcInfo.ProcInfo(self.logger)
    self.procInfo.update()
    # start the background monitoring thread
    th = threading.Thread(target=self.__bgMonitor)
    th.setDaemon(True)
    th.start()
|
201 |
|
|
|
202 |
|
|
|
203 |
|
|
def sendParams (self, params):
    """
    Send several parameters to MonALISA without an explicit timestamp,
    using the default (last given) cluster and node names.
    """
    noTimeStamp = -1  # a non-positive timestamp means "no timestamp given"
    self.sendTimedParams(noTimeStamp, params)
|
208 |
|
|
|
209 |
|
|
def sendTimedParams (self, timeStamp, params):
    """
    Send several parameters with the given timestamp, using the default
    (last given) cluster and node names.
    (See sendTimedParameters for more details)
    """
    useDefaultCluster = None
    useDefaultNode = None
    self.sendTimedParameters(useDefaultCluster, useDefaultNode, timeStamp, params)
|
215 |
|
|
|
216 |
|
|
def sendParameter (self, clusterName, nodeName, paramName, paramValue):
    """
    Send one parameter to MonALISA, without an explicit timestamp.
    """
    noTimeStamp = -1  # a non-positive timestamp means "no timestamp given"
    self.sendTimedParameter(clusterName, nodeName, noTimeStamp, paramName, paramValue)
|
221 |
|
|
|
222 |
|
|
def sendTimedParameter (self, clusterName, nodeName, timeStamp, paramName, paramValue):
    """
    Send one parameter to MonALISA, with the given timestamp.
    """
    singleParam = {}
    singleParam[paramName] = paramValue
    self.sendTimedParameters(clusterName, nodeName, timeStamp, singleParam)
|
227 |
|
|
|
228 |
|
|
def sendParameters (self, clusterName, nodeName, params):
    """
    Send several parameters for the given cluster and node names,
    without an explicit timestamp.
    """
    noTimeStamp = -1  # a non-positive timestamp means "no timestamp given"
    self.sendTimedParameters(clusterName, nodeName, noTimeStamp, params)
|
233 |
|
|
|
234 |
|
|
def sendTimedParameters (self, clusterName, nodeName, timeStamp, params):
    """
    Send multiple monitored parameters to MonALISA.

    - clusterName is the name of the cluster being monitored. If it is None,
      the last given cluster name is used (initially "ApMon_UserSend");
      otherwise it becomes the new default.
    - nodeName is the name of the node the parameters belong to. If it is
      None, the last given node name is used (initially the full hostname
      of this machine); otherwise it becomes the new default.
    - timeStamp, if > 0, is the given time for the parameters, in seconds from Epoch.
      Note that this option should be used only if you are sure about the time
      for the result. Otherwise, the parameters will be assigned a correct time
      (obtained from NTP servers) in the MonALISA service. This option can be
      useful when parsing logs, for example.
    - params is a dictionary containing pairs with:
        - key: parameter name
        - value: parameter value, either int or float.
      or params is a list of tuples (key, value). This version can be used
      in case you want to send the parameters in a given order.

    NOTE that python doesn't know about 32-bit floats (only 64-bit floats!)

    If the instance was not initialized correctly, a warning is logged and
    nothing is sent.
    """
    if not self.__initializedOK:
        self.logger.log(Logger.WARNING, "Not initialized correctly. Message NOT sent!")
        return
    if clusterName is None:
        clusterName = self.__defaultUserCluster
    else:
        self.__defaultUserCluster = clusterName
    if nodeName is None:
        nodeName = self.__defaultUserNode
    else:
        self.__defaultUserNode = nodeName
    # Hold the config lock so destinations are not swapped mid-send; release
    # it even if a send raises, otherwise every later send and config reload
    # would block forever.
    self.__configUpdateLock.acquire()
    try:
        for dest in self.destinations.keys():
            self.__directSendParams(dest, clusterName, nodeName, timeStamp, params)
    finally:
        self.__configUpdateLock.release()
|
270 |
|
|
|
271 |
|
|
def addJobToMonitor (self, pid, workDir, clusterName, nodeName):
    """
    Add a new job to monitor.

    - pid is the process id of the job; it is the key in monitoredJobs
    - workDir is passed to ProcInfo together with the pid
    - clusterName, nodeName are the names under which the job's data is sent
    """
    self.__bgMonitorLock.acquire()
    try:
        # Register with ProcInfo first: if that fails, no half-registered
        # entry is left behind in monitoredJobs.
        self.procInfo.addJobToMonitor(pid, workDir)
        self.monitoredJobs[pid] = {
            'CLUSTER_NAME': clusterName,
            'NODE_NAME': nodeName,
        }
    finally:
        # always release, otherwise the background monitor would deadlock
        self.__bgMonitorLock.release()
|
281 |
|
|
|
282 |
|
|
def removeJobToMonitor (self, pid):
    """
    Remove a job from being monitored.

    Raises KeyError if pid is not in monitoredJobs.
    """
    self.__bgMonitorLock.acquire()
    try:
        self.procInfo.removeJobToMonitor(pid)
        del self.monitoredJobs[pid]
    finally:
        # An unknown pid raises KeyError above; the lock must still be
        # released or the background monitor thread blocks forever.
        self.__bgMonitorLock.release()
|
290 |
|
|
|
291 |
|
|
def setMonitorClusterNode (self, clusterName, nodeName):
    """
    Choose the cluster and node names under which system monitoring
    information is sent.
    """
    monitorLock = self.__bgMonitorLock
    monitorLock.acquire()
    # both names are updated under the lock so the background monitor
    # never sees a new cluster paired with an old node
    self.__defaultSysMonCluster, self.__defaultSysMonNode = clusterName, nodeName
    monitorLock.release()
|
299 |
|
|
|
300 |
|
|
def enableBgMonitoring (self, onOff):
    """
    Turn the periodic background monitoring on or off. Background monitoring
    information can still be sent on demand by calling sendBgMonitoring.
    """
    setattr(self, 'performBgMonitoring', onOff)
|
306 |
|
|
|
307 |
|
|
def sendBgMonitoring (self):
    """
    Send now background monitoring about system and jobs to all interested destinations.

    For every destination, its options decide what is due:
    - "sys_*" parameters, when sys_monitoring is on and sys_interval has elapsed
    - "job_*" parameters, when job_monitoring is on and job_interval has elapsed
    - general host parameters (the keys without a sys_/job_ prefix), when
      general_info is on and twice sys_interval has elapsed
    The corresponding *_data_sent stamp is updated for each group selected.
    """
    # entries that configure a group rather than naming a parameter
    controlNames = ('monitoring', 'interval', 'data_sent')

    def activeParams (options, prefix):
        # names (without prefix) of the enabled parameters of one group
        names = []
        for key, active in options.items():
            if active and key.startswith(prefix):
                name = key[len(prefix):]
                if name and name not in controlNames:
                    names.append(name)
        return names

    self.__bgMonitorLock.acquire()
    try:
        self.procInfo.update()
        now = int(time.time())
        for destination, options in self.destinations.items():
            sysParams = []
            jobParams = []
            if options['sys_monitoring'] and options['sys_data_sent'] + options['sys_interval'] < now:
                sysParams.extend(activeParams(options, 'sys_'))
                options['sys_data_sent'] = now
            if options['job_monitoring'] and options['job_data_sent'] + options['job_interval'] < now:
                jobParams.extend(activeParams(options, 'job_'))
                options['job_data_sent'] = now
            if options['general_info'] and options['general_data_sent'] + 2 * int(options['sys_interval']) < now:
                for key, active in options.items():
                    if not active or key.startswith("sys_") or key.startswith("job_"):
                        continue
                    if key != 'general_info' and key != 'general_data_sent':
                        sysParams.append(key)
                # Record the send time; without this the general information
                # would be resent on every call instead of every 2*sys_interval.
                options['general_data_sent'] = now

            sysResults = {}
            if len(sysParams) > 0:
                sysResults = self.procInfo.getSystemData(sysParams)
            if len(sysResults) > 0:
                self.__directSendParams(destination, self.__defaultSysMonCluster, self.__defaultSysMonNode, -1, sysResults)
            for pid, props in self.monitoredJobs.items():
                jobResults = {}
                if len(jobParams) > 0:
                    jobResults = self.procInfo.getJobData(pid, jobParams)
                if len(jobResults) > 0:
                    self.__directSendParams(destination, props['CLUSTER_NAME'], props['NODE_NAME'], -1, jobResults)
    finally:
        # release even if ProcInfo or a send raises, so that job
        # registration and later monitoring rounds are not blocked
        self.__bgMonitorLock.release()
|
351 |
|
|
|
352 |
|
|
def setLogLevel (self, strLevel):
    """
    Set the log level of this instance's logger. The level is given as a
    string, one of 'FATAL', 'ERROR', 'WARNING', 'INFO', 'NOTICE', 'DEBUG'.
    """
    activeLogger = self.logger
    activeLogger.setLogLevel(strLevel)
|
358 |
|
|
|
359 |
|
|
#########################################################################################
|
360 |
|
|
# Internal functions - Config reloader thread
|
361 |
|
|
#########################################################################################
|
362 |
|
|
|
363 |
|
|
def __configLoader(self):
    """
    Body of the config-watching thread: sleep for configRecheckInterval
    seconds, then reload the configuration if rechecking is enabled. Never returns.
    """
    while True:
        time.sleep(self.configRecheckInterval)
        if not self.configRecheck:
            continue
        self.__reloadAddresses()
        intervalText = repr(self.configRecheckInterval)
        self.logger.log(Logger.DEBUG, "Config reloaded. Seleeping for " + intervalText + " sec.")
|
372 |
|
|
|
373 |
|
|
def __reloadAddresses(self):
    """
    Rebuild the destinations hash from every source listed in
    configAddresses, then swap it in under the config lock.
    """
    freshDestinations = {}
    for source in self.configAddresses:
        self.__initializeFromFile(source, freshDestinations)
    # swap under the lock so a send in progress keeps a consistent set
    updateLock = self.__configUpdateLock
    updateLock.acquire()
    self.destinations = freshDestinations
    updateLock.release()
|
384 |
|
|
|
385 |
|
|
def __addDestination (self, aDestination, tempDestinations, options = __defaultOptions):
    """
    Add a destination to the tempDestinations hash.

    aDestination is a string of the form "{hostname|ip}[:port] [passwd]" without quotes.
    If the port is not given, the default port (8884) is used.
    If the password is missing, it is considered an empty string.

    The host is resolved to an IP address. A destination whose (ip, port) is
    already present is skipped. A new destination gets its own copy of the
    default options, overwritten with the entries of options if given.
    A non-numeric port is logged as a warning and the destination is ignored.
    """
    # normalise whitespace: tabs become spaces, runs collapse to one space
    aDestination = ' '.join(aDestination.split())
    sepPort = aDestination.find(':')
    sepPasswd = aDestination.rfind(' ')
    if sepPort >= 0:
        host = aDestination[0:sepPort].strip()
        if sepPasswd > sepPort + 1:
            port = aDestination[sepPort+1:sepPasswd].strip()
            passwd = aDestination[sepPasswd:].strip()
        else:
            port = aDestination[sepPort+1:].strip()
            passwd = ""
    else:
        port = str(self.__defaultPort)
        if sepPasswd >= 0:
            host = aDestination[0:sepPasswd].strip()
            passwd = aDestination[sepPasswd:].strip()
        else:
            host = aDestination.strip()
            passwd = ""
    if not port.isdigit():
        self.logger.log(Logger.WARNING, "Bad value for port number "+repr(port)+" in "+aDestination+" destination")
        return
    port = int(port)
    host = socket.gethostbyname(host)  # convert hostnames to IP addresses to avoid suffocating DNSs
    for h, p, w in tempDestinations.keys():
        if h == host and p == port:
            # port is an int here, so it must be converted before concatenation
            self.logger.log(Logger.NOTICE, "Destination "+host+":"+str(port)+" "+passwd+" already added. Skipping it")
            return
    self.logger.log(Logger.INFO, "Adding destination "+host+':'+repr(port)+' '+passwd)
    # Every destination needs its own options hash: the *_data_sent stamps
    # and any overrides are stored in it, and must not leak into other
    # destinations or into the class-wide defaults.
    destOptions = dict(self.__defaultOptions)
    if options != self.__defaultOptions:
        # we have to overwrite defaults with given options
        for key, value in options.items():
            self.logger.log(Logger.NOTICE, "Overwritting option: "+key+" = "+repr(value))
            destOptions[key] = value
    tempDestinations[(host, port, passwd)] = destOptions
|
434 |
|
|
|
435 |
|
|
def __initializeFromFile (self, confFileName, tempDestinations):
    """
    Load destinations from the confFileName file. If it is a URL (starts with
    "http://") load the configuration from there. Put all destinations in the
    tempDestinations hash.

    Lines that are empty or start with # are ignored. Lines of the form
    "xApMon_<param> = <value>" set options: ON/OFF become True/False and
    *_interval values become integers; loglevel, conf_recheck and
    recheck_interval are applied to this instance, *_data_sent is ignored and
    everything else is passed as an option to each destination. Every other
    line is handed to __addDestination.

    If the source cannot be opened, the error is logged and nothing is added.
    """
    try:
        if confFileName.find("http://") == 0:
            confFile = urllib2.urlopen(confFileName)
        else:
            confFile = open(confFileName)
    except urllib2.HTTPError as e:
        self.logger.log(Logger.ERROR, "Cannot open "+confFileName)
        httpMessages = {
            401: 'HTTPError: not authorized.',
            404: 'HTTPError: not found.',
            503: 'HTTPError: service unavailable.',
        }
        self.logger.log(Logger.ERROR, httpMessages.get(e.code, 'HTTPError: unknown error.'))
        return
    except urllib2.URLError as ex:
        self.logger.log(Logger.ERROR, "Cannot open "+confFileName)
        # reason may be a plain string or an exception; indexing it is not safe
        self.logger.log(Logger.ERROR, "URL Error: "+str(ex.reason))
        return
    except IOError as ex:
        self.logger.log(Logger.ERROR, "Cannot open "+confFileName)
        self.logger.log(Logger.ERROR, "IOError: "+str(ex))
        return
    self.logger.log(Logger.INFO, "Adding destinations from "+confFileName)
    dests = []
    opts = {}
    try:
        while True:
            line = confFile.readline()
            if line == '':
                break
            line = line.strip()
            self.logger.log(Logger.DEBUG, "Reading line "+line)
            if len(line) == 0 or line[0] == '#':
                continue
            if not line.startswith("xApMon_"):
                dests.append(line)
                continue
            m = re.match(r"xApMon_(\S+)\s*=\s*(\S+)", line)
            if m is None:
                continue  # malformed option line: ignored
            param = m.group(1)
            value = m.group(2)
            if value.upper() == "ON":
                value = True
            elif value.upper() == "OFF":
                value = False
            elif param.endswith("_interval"):
                try:
                    value = int(value)
                except ValueError:
                    # a bad number must not abort loading (this also runs
                    # in the config reloading thread)
                    self.logger.log(Logger.WARNING, "Bad value for "+param+": "+repr(value))
                    continue
            if param == "loglevel":
                self.logger.setLogLevel(value)
            elif param == "conf_recheck":
                self.configRecheck = value
            elif param == "recheck_interval":
                self.configRecheckInterval = value
            elif param.endswith("_data_sent"):
                pass  # don't reset time in sys/job/general/_data_sent
            else:
                opts[param] = value
    finally:
        # close the file or HTTP response even if reading fails
        confFile.close()
    for line in dests:
        self.__addDestination(line, tempDestinations, opts)
|
505 |
|
|
|
506 |
|
|
###############################################################################################
|
507 |
|
|
# Internal functions - Background monitor thread
|
508 |
|
|
###############################################################################################
|
509 |
|
|
|
510 |
|
|
def __bgMonitor (self):
    """
    Body of the background monitoring thread: every 10 seconds, send the
    background monitoring data if performBgMonitoring is set. Never returns.
    """
    while True:
        time.sleep(10)
        if not self.performBgMonitoring:
            continue
        self.sendBgMonitoring()
|
515 |
|
|
|
516 |
|
|
###############################################################################################
|
517 |
|
|
# Internal helper functions
|
518 |
|
|
###############################################################################################
|
519 |
|
|
|
520 |
|
|
def __directSendParams (self, destination, clusterName, nodeName, timeStamp, params):
    """
    Pack the given parameters in one XDR datagram and send it to destination.

    - destination is a (host, port, passwd) tuple
    - clusterName, nodeName are packed as strings in the packet header
    - timeStamp is appended to the packet only if > 0
    - params is either a hash {name: value} or a list of (name, value) tuples

    Any other params type is logged as a warning and nothing is sent.
    Socket errors while sending are logged, not raised.
    """
    host, port, passwd = destination
    if isinstance(params, dict):
        pairs = list(params.items())
    elif isinstance(params, list):
        pairs = params
    else:
        # Do not send a packet that announces parameters it does not contain.
        self.logger.log(Logger.WARNING, "Unsupported params type in sendParameters: " + str(type(params)))
        return
    self.logger.log(Logger.DEBUG, "Building XDR packet for ["+str(clusterName)+"] <"+str(nodeName)+"> len:"+str(len(pairs)))
    xdrPacker = xdrlib.Packer()
    xdrPacker.pack_string("v:"+self.__version+"p:"+passwd)
    xdrPacker.pack_string(clusterName)
    xdrPacker.pack_string(nodeName)
    xdrPacker.pack_int(len(pairs))
    for name, value in pairs:
        # __packParameter logs each parameter it adds
        self.__packParameter(xdrPacker, name, value)
    if timeStamp > 0:
        xdrPacker.pack_int(timeStamp)
    packet = xdrPacker.get_buffer()
    # send this buffer to the destination, using udp datagrams
    try:
        self.__udpSocket.sendto(packet, (host, port))
        self.logger.log(Logger.DEBUG, "Packet sent")
    except socket.error as msg:
        # str(msg) is used because not every socket error has a second argument
        self.logger.log(Logger.ERROR, "Cannot send packet to "+host+":"+str(port)+" "+passwd+": "+str(msg))
    xdrPacker.reset()
|
547 |
|
|
|
548 |
|
|
def __packParameter(self, xdrPacker, name, value):
    """
    Append one parameter to xdrPacker: its name, the XDR type code looked up
    from the Python type of value, then the value itself. Any error while
    packing is printed and swallowed.
    """
    try:
        typeCode = self.__valueTypes[type(value)]
        xdrPacker.pack_string(name)
        xdrPacker.pack_int(typeCode)
        packValue = self.__packFunctions[typeCode]
        packValue(xdrPacker, value)
        message = "Adding parameter " + str(name) + " = " + str(value)
        self.logger.log(Logger.DEBUG, message)
    except Exception as ex:
        print("ApMon: error packing %s = %s; got %s" % (name, str(value), ex))
|
557 |
|
|
|
558 |
|
|
# Destructor
|
559 |
|
|
def __del__(self):
    """
    Destructor: close the UDP socket, if one was opened.
    """
    try:
        udpSocket = self.__udpSocket
    except AttributeError:
        # the socket is only created when initialization succeeded
        return
    udpSocket.close()
|
561 |
|
|
|
562 |
|
|
################################################################################################
|
563 |
|
|
# Private variables. Don't touch
|
564 |
|
|
################################################################################################
|
565 |
|
|
|
566 |
|
|
# Maps the Python type of a parameter value to the XDR type code sent in the packet.
__valueTypes = {
type("string"): 0,  # XDR_STRING (see ApMon.h from C/C++ ApMon version)
type(1): 2,  # XDR_INT32
type(1.0): 5};  # XDR_REAL64

# Maps an XDR type code to the xdrlib.Packer function that packs such a value.
__packFunctions = {
0: xdrlib.Packer.pack_string,
2: xdrlib.Packer.pack_int,
5: xdrlib.Packer.pack_double }

# Cluster/node names used for user-sent parameters until the caller gives others.
__defaultUserCluster = "ApMon_UserSend";
__defaultUserNode = socket.getfqdn();
# Cluster/node names used for background system monitoring data.
__defaultSysMonCluster = "ApMon_SysMon";
__defaultSysMonNode = socket.getfqdn();

__initializedOK = True
# Guards the swap of the destinations hash against sends in progress.
__configUpdateLock = threading.Lock()
# Guards monitored jobs, sys-monitor names and background monitoring rounds.
__bgMonitorLock = threading.Lock()

__defaultPort = 8884
__defaultLogLevel = Logger.INFO
# NOTE(review): the file header says "Version: 2.0.4" while this says 2.0.2 - confirm which is current.
__version = "2.0.2-py"  # apMon version number
|
588 |
|
|
|