Skip to content

Commit 3aae3c9

Browse files
committed
Closes #1012: monitoring log rotation
Closes #1027: monitoring log in the arbiter - still to be improved
1 parent 62f5c13 commit 3aae3c9

File tree

96 files changed

+1240
-1147
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

96 files changed

+1240
-1147
lines changed

alignak/brok.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ class Brok(object):
5959
"""A Brok is a piece of information exported by Alignak to the Broker.
6060
Broker can do whatever he wants with it.
6161
62+
A specific type of Brok exists when the type is monitoring_log. This Brok contains
63+
a monitoring event (alert, notification, ...) information
64+
6265
Broks types:
6366
- log
6467
- monitoring_log
@@ -105,11 +108,24 @@ def __init__(self, params, parsing=True):
105108
else:
106109
self.data = serialize(params['data'])
107110

108-
def __repr__(self): # pragma: no cover
111+
def __repr__(self):
109112
ct = datetime.fromtimestamp(self.creation_time).strftime("%Y-%m-%d %H:%M:%S.%f")
110113
return "Brok %s (%s) '%s': %s" % (self.uuid, ct, self.type, self.data)
111114
__str__ = __repr__
112115

116+
def get_event(self):
117+
"""This function returns an Event from a Brok
118+
119+
If the type is monitoring_log then the Brok contains a monitoring event
120+
(alert, notification, ...) information. This function will return a tuple
121+
with the creation time, the level and message information
122+
123+
:return: tuple with date, level and message
124+
:rtype: tuple
125+
"""
126+
self.prepare()
127+
return (self.creation_time, self.data['level'], self.data['message'])
128+
113129
def serialize(self):
114130
"""This function serializes into a simple dict object.
115131
It is used when transferring data to other daemons over the network (http)
@@ -130,7 +146,7 @@ def prepare(self):
130146
131147
:return: None
132148
"""
133-
# Maybe the brok is a old daemon one or was already prepared
149+
# Maybe the Brok is an old daemon one or was already prepared
134150
# if so, the data is already ok
135151
if hasattr(self, 'prepared') and not self.prepared:
136152
try:

alignak/daemon.py

+19-20
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ def __init__(self, name, **kwargs):
626626
self.check_dir(os.path.dirname(self.log_filename))
627627

628628
# Specific monitoring log directory
629-
self.check_dir(os.path.join(os.path.dirname(self.log_filename), 'monitoring-log'))
629+
# self.check_dir(os.path.join(os.path.dirname(self.log_filename), 'monitoring-log'))
630630

631631
if 'log_filename' not in kwargs or not kwargs['log_filename']:
632632
# Log file name is not overridden, the logger will use the configured default one
@@ -1181,8 +1181,8 @@ def do_load_modules(self, modules):
11811181
statsmgr.timer('modules.load-time', time.time() - _ts)
11821182

11831183
def add(self, elt):
1184-
""" Abstract method for adding brok
1185-
It is overridden in subclasses of Daemon
1184+
""" Abstract method for adding brok, external commands, messages, ...
1185+
It is overridden in subclasses (Satellite) of Daemon
11861186
11871187
:param elt: element to add
11881188
:type elt:
@@ -1485,21 +1485,21 @@ def do_daemon_init_and_start(self, set_proc_title=True):
14851485
# Else, I set my own pid as the reference one
14861486
self.write_pid(os.getpid())
14871487

1488-
# TODO: check if really necessary!
1489-
# -------
1490-
# Set ownership on some default log files. It may happen that these default
1491-
# files are owned by a privileged user account
1492-
try:
1493-
for log_file in ['alignak.log', 'monitoring-logs.log']:
1494-
if os.path.exists('/tmp/%s' % log_file):
1495-
with open('/tmp/%s' % log_file, "w") as file_log_file:
1496-
os.fchown(file_log_file.fileno(), self.uid, self.gid)
1497-
if os.path.exists('/tmp/monitoring-log/%s' % log_file):
1498-
with open('/tmp/monitoring-log/%s' % log_file, "w") as file_log_file:
1499-
os.fchown(file_log_file.fileno(), self.uid, self.gid)
1500-
except Exception as exp: # pylint: disable=broad-except
1501-
# pragma: no cover
1502-
print("Could not set default log files ownership, exception: %s" % str(exp))
1488+
# # TODO: check if really necessary!
1489+
# # -------
1490+
# # Set ownership on some default log files. It may happen that these default
1491+
# # files are owned by a privileged user account
1492+
# try:
1493+
# for log_file in ['alignak.log', 'alignak-events.log']:
1494+
# if os.path.exists('/tmp/%s' % log_file):
1495+
# with open('/tmp/%s' % log_file, "w") as file_log_file:
1496+
# os.fchown(file_log_file.fileno(), self.uid, self.gid)
1497+
# if os.path.exists('/tmp/monitoring-log/%s' % log_file):
1498+
# with open('/tmp/monitoring-log/%s' % log_file, "w") as file_log_file:
1499+
# os.fchown(file_log_file.fileno(), self.uid, self.gid)
1500+
# except Exception as exp: # pylint: disable=broad-except
1501+
# # pragma: no cover
1502+
# print("Could not set default log files ownership, exception: %s" % str(exp))
15031503

15041504
# Configure the daemon logger
15051505
self.setup_alignak_logger()
@@ -1910,8 +1910,7 @@ def watch_for_new_conf(self, timeout=0):
19101910
return any(self.new_conf)
19111911

19121912
def hook_point(self, hook_name, handle=None):
1913-
"""Used to call module function that may define a hook function
1914-
for hook_name
1913+
"""Used to call module function that may define a hook function for hook_name
19151914
19161915
Available hook points:
19171916
- `tick`, called on each daemon loop turn

alignak/daemons/arbiterdaemon.py

+30-11
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,15 @@ def __init__(self, **kwargs):
173173
except ValueError: # pragma: no cover, simple protection
174174
pass
175175

176+
# This is because it is the Satellite that has these properties and I am a Daemon
177+
# todo: change this?
178+
# My own broks
176179
self.broks = []
177180
self.broks_lock = threading.RLock()
181+
# My own monitoring events
182+
self.events = []
183+
self.events_lock = threading.RLock()
184+
178185
self.is_master = False
179186
self.link_to_myself = None
180187
self.instance_id = None
@@ -210,9 +217,16 @@ def add(self, elt):
210217
:return: None
211218
"""
212219
if isinstance(elt, Brok):
220+
# For brok, we tag the brok with our instance_id
213221
elt.instance_id = self.instance_id
214-
with self.broks_lock:
215-
self.broks.append(elt)
222+
if elt.type == 'monitoring_log':
223+
# The brok is a monitoring event
224+
with self.events_lock:
225+
self.events.append(elt)
226+
statsmgr.counter('events', 1)
227+
else:
228+
with self.broks_lock:
229+
self.broks.append(elt)
216230
statsmgr.counter('broks.added', 1)
217231
elif isinstance(elt, ExternalCommand):
218232
logger.debug("Queuing an external command '%s'", str(elt.__dict__))
@@ -1159,14 +1173,21 @@ def daemons_reachability_check(self):
11591173

11601174
_t0 = time.time()
11611175
logger.debug("Alignak daemons reachability check")
1162-
self.dispatcher.check_reachable()
1176+
result = self.dispatcher.check_reachable()
11631177
statsmgr.timer('dispatcher.check-alive', time.time() - _t0)
11641178

11651179
_t0 = time.time()
11661180
logger.debug("Alignak daemons status get")
1167-
result = self.dispatcher.check_status()
1168-
statsmgr.timer('dispatcher.check-status', time.time() - _t0)
1169-
logger.debug("Getting daemons status duration: %.2f seconds", time.time() - _t0)
1181+
events = self.dispatcher.check_status_and_get_events()
1182+
duration = time.time() - _t0
1183+
statsmgr.timer('dispatcher.check-status', duration)
1184+
logger.debug("Getting daemons status duration: %.2f seconds", duration)
1185+
1186+
# Send the collected events to the Alignak logger
1187+
for event in events:
1188+
event.prepare()
1189+
make_monitoring_log(event.data['level'], event.data['message'],
1190+
timestamp=event.creation_time, to_logger=True)
11701191

11711192
# Set the last check as now
11721193
self.daemons_last_reachable_check = start
@@ -2060,11 +2081,9 @@ def main(self):
20602081
_ts = time.time()
20612082
logger.warning('--- Reloading configuration...')
20622083
self.load_monitoring_config_file()
2063-
brok = make_monitoring_log('info', 'CONFIGURATION RELOAD')
2064-
if self.conf.monitoring_log_broks:
2065-
self.add(brok)
2066-
logger.warning('--- Configuration reloaded, %.2f seconds',
2067-
time.time() - _ts)
2084+
duration = int(time.time() - _ts)
2085+
self.add(make_monitoring_log('info', 'CONFIGURATION RELOAD;%d' % duration))
2086+
logger.warning('--- Configuration reloaded, %d seconds', duration)
20682087

20692088
# Make a pause to let our satellites get ready...
20702089
pause = max(1, self.conf.daemons_new_conf_timeout)

alignak/daemons/brokerdaemon.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,16 @@ def add(self, elt):
136136
:return: None
137137
"""
138138
if isinstance(elt, Brok):
139-
# We tag the broks with our instance_id
139+
# For brok, we tag the brok with our instance_id
140140
elt.instance_id = self.instance_id
141-
with self.broks_lock:
142-
self.internal_broks.append(elt)
141+
if elt.type == 'monitoring_log':
142+
# The brok is a monitoring event
143+
with self.events_lock:
144+
self.events.append(elt)
145+
statsmgr.counter('events', 1)
146+
else:
147+
with self.broks_lock:
148+
self.broks.append(elt)
143149
statsmgr.counter('broks.added', 1)
144150
elif isinstance(elt, ExternalCommand):
145151
logger.debug("Queuing an external command '%s'", str(elt.__dict__))
@@ -319,7 +325,7 @@ def setup_new_conf(self):
319325
new_link = SatelliteLink.get_a_satellite_link(link_type[:-1],
320326
rs_conf)
321327
my_satellites[new_link.uuid] = new_link
322-
logger.info("I got a new %s satellite: %s", link_type, new_link)
328+
logger.info("I got a new %s satellite: %s", link_type[:-1], new_link)
323329
# print("My new %s satellite: %s" % (link_type, new_link))
324330

325331
new_link.running_id = running_id

alignak/daemons/receiverdaemon.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,16 @@ def add(self, elt):
121121
elt = ExternalCommand(elt['cmd_line'], elt['creation_timestamp'])
122122

123123
if isinstance(elt, Brok):
124-
# We tag the broks with our instance_id
124+
# For brok, we tag the brok with our instance_id
125125
elt.instance_id = self.instance_id
126-
with self.broks_lock:
127-
self.broks.append(elt)
126+
if elt.type == 'monitoring_log':
127+
# The brok is a monitoring event
128+
with self.events_lock:
129+
self.events.append(elt)
130+
statsmgr.counter('events', 1)
131+
else:
132+
with self.broks_lock:
133+
self.broks.append(elt)
128134
statsmgr.counter('broks.added', 1)
129135
elif isinstance(elt, ExternalCommand):
130136
logger.debug("Queuing an external command: %s", str(ExternalCommand.__dict__))

alignak/daemons/schedulerdaemon.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ def __init__(self, **kwargs):
113113
self.reactionners = {}
114114
self.receivers = {}
115115

116+
# This is because it is the Satellite that has these properties and I am a Satellite
117+
# todo: change this?
118+
# Broks are stored in each broker link, not locally
119+
# self.broks = []
116120
self.broks_lock = threading.RLock()
117121

118122
# Modules are only loaded one time
@@ -287,14 +291,19 @@ def get_managed_configurations(self):
287291
hash, push_flavor and configuration identifier as values
288292
:rtype: dict
289293
"""
294+
# for scheduler_link in list(self.schedulers.values()):
295+
# res[scheduler_link.instance_id] = {
296+
# 'hash': scheduler_link.hash,
297+
# 'push_flavor': scheduler_link.push_flavor,
298+
# 'managed_conf_id': scheduler_link.managed_conf_id
299+
# }
300+
290301
res = {}
291-
if self.sched.pushed_conf and self.cur_conf:
292-
res = {
293-
self.cur_conf['instance_id']: {
294-
'hash': self.cur_conf['hash'],
295-
'push_flavor': self.cur_conf['push_flavor'],
296-
'managed_conf_id': self.cur_conf['managed_conf_id']
297-
}
302+
if self.sched.pushed_conf and self.cur_conf and 'instance_id' in self.cur_conf:
303+
res[self.cur_conf['instance_id']] = {
304+
'hash': self.cur_conf['hash'],
305+
'push_flavor': self.cur_conf['push_flavor'],
306+
'managed_conf_id': self.cur_conf['managed_conf_id']
298307
}
299308
logger.debug("Get managed configuration: %s", res)
300309
return res
@@ -386,7 +395,7 @@ def setup_new_conf(self):
386395
new_link = SatelliteLink.get_a_satellite_link(link_type[:-1],
387396
rs_conf)
388397
my_satellites[new_link.uuid] = new_link
389-
logger.info("I got a new %s satellite: %s", link_type, new_link)
398+
logger.info("I got a new %s satellite: %s", link_type[:-1], new_link)
390399
# print("My new %s satellite: %s" % (link_type, new_link))
391400

392401
new_link.running_id = running_id

alignak/dispatcher.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,14 @@ def check_reachable(self, forced=False, test=False):
265265
self.not_configured.append(daemon_link)
266266

267267
if self.not_configured and self.new_to_dispatch and not self.first_dispatch_done:
268-
logger.info("Dispatcher, those daemons are not configured: %s, "
268+
logger.info("Dispatcher, these daemons are not configured: %s, "
269269
"and a configuration is ready to dispatch, run the dispatching...",
270270
','.join(d.name for d in self.not_configured))
271271
self.dispatch_ok = False
272272
self.dispatch(test=test)
273273

274274
elif self.not_configured and self.first_dispatch_done:
275-
logger.info("Dispatcher, those daemons are not configured: %s, "
275+
logger.info("Dispatcher, these daemons are not configured: %s, "
276276
"and a configuration has yet been dispatched dispatch, "
277277
"a new dispatch is required...",
278278
','.join(d.name for d in self.not_configured))
@@ -284,15 +284,16 @@ def check_reachable(self, forced=False, test=False):
284284

285285
return all_ok
286286

287-
def check_status(self):
287+
def check_status_and_get_events(self):
288288
# pylint: disable=too-many-branches
289289
"""Get all the daemons status
290290
291291
292-
:return: Fictionary with all the daemons returned information
292+
:return: List of the events collected from all the daemons
293293
:rtype: list
294294
"""
295295
statistics = {}
296+
events = []
296297
for daemon_link in self.all_daemons_links:
297298
if daemon_link == self.arbiter_link:
298299
# I exclude myself from the polling, sure I am reachable ;)
@@ -312,7 +313,15 @@ def check_status(self):
312313
except LinkError:
313314
logger.warning("Daemon connection failed, I could not get statistics.")
314315

315-
return statistics
316+
try:
317+
got = daemon_link.get_events()
318+
if got:
319+
events.extend(got)
320+
logger.debug("Daemon %s has %d events: %s", daemon_link.name, len(got), got)
321+
except LinkError:
322+
logger.warning("Daemon connection failed, I could not get events.")
323+
324+
return events
316325

317326
def check_dispatch(self):
318327
"""Check that all active satellites have a configuration dispatched

0 commit comments

Comments
 (0)