Skip to content

Commit a6997ae

Browse files
committed
Alignak overall statistics from the daemons - get problems / livesynthesis from the schedulers
1 parent bd8c53a commit a6997ae

18 files changed

+623
-232
lines changed

alignak/daemons/arbiterdaemon.py

+102-5
Original file line numberDiff line numberDiff line change
@@ -1151,17 +1151,22 @@ def daemons_reachability_check(self):
11511151
"""
11521152
# First look if it's not too early to ping
11531153
start = time.time()
1154-
if self.daemons_last_reachable_check \
1155-
and self.daemons_last_reachable_check + self.conf.daemons_check_period > start:
1154+
if self.daemons_last_reachable_check and \
1155+
self.daemons_last_reachable_check + self.conf.daemons_check_period > start:
11561156
logger.debug("Too early to check daemons reachability, check period is %.2f seconds",
11571157
self.conf.daemons_check_period)
11581158
return True
11591159

1160+
_t0 = time.time()
11601161
logger.debug("Alignak daemons reachability check")
1162+
self.dispatcher.check_reachable()
1163+
statsmgr.timer('dispatcher.check-alive', time.time() - _t0)
11611164

11621165
_t0 = time.time()
1163-
result = self.dispatcher.check_reachable()
1164-
statsmgr.timer('dispatcher.check-alive', time.time() - _t0)
1166+
logger.debug("Alignak daemons status get")
1167+
result = self.dispatcher.check_status()
1168+
statsmgr.timer('dispatcher.check-status', time.time() - _t0)
1169+
logger.debug("Getting daemons status duration: %.2f seconds", time.time() - _t0)
11651170

11661171
# Set the last check as now
11671172
self.daemons_last_reachable_check = start
@@ -1718,7 +1723,6 @@ def get_daemon_stats(self, details=False): # pylint: disable=too-many-branches
17181723
if getattr(self, 'dispatcher', None):
17191724
# Daemon properties that we are interested in
17201725
res['daemons_states'] = {}
1721-
state = 0
17221726
for satellite in self.dispatcher.all_daemons_links:
17231727
if satellite == self.link_to_myself:
17241728
continue
@@ -1758,6 +1762,99 @@ def get_daemon_stats(self, details=False): # pylint: disable=too-many-branches
17581762

17591763
return res
17601764

1765+
def get_monitoring_problems(self):
    """Get the schedulers satellites problems list

    Starts from the dictionary returned by `get_id` and adds a `problems`
    dictionary with one entry per active scheduler link whose stored
    statistics contain a `problems` item. Each entry contains:
    - _freshness, the timestamp stored along with the scheduler statistics
      (None if the statistics do not carry any timestamp)
    - problems, the `problems` item of the scheduler statistics

    :return: problems dictionary
    :rtype: dict
    """
    res = self.get_id()
    res['problems'] = {}

    # Report our schedulers information, but only if a dispatcher exists
    dispatcher = getattr(self, 'dispatcher', None)
    if dispatcher is None:
        return res

    for satellite in dispatcher.all_daemons_links:
        # Only the active schedulers are concerned
        if satellite.type != 'scheduler' or not satellite.active:
            continue

        statistics = satellite.statistics
        if 'problems' not in statistics:
            continue

        res['problems'][satellite.name] = {
            # Do not fail if the statistics were stored without a timestamp
            '_freshness': statistics.get('_freshness'),
            'problems': statistics['problems']
        }

    return res
1791+
1792+
def get_livesynthesis(self):
    """Get the schedulers satellites live synthesis

    Starts from the dictionary returned by `get_id` and adds a `livesynthesis`
    dictionary which contains:
    - one entry per active scheduler link whose stored statistics contain a
      `livesynthesis` item, with the statistics timestamp (`_freshness`) and
      the scheduler live synthesis
    - an `_overall` entry which cumulates the counters of all the reported
      schedulers; its `_freshness` is the current time

    Only the counters known in the `_overall` live synthesis are cumulated;
    any other item found in a scheduler live synthesis is ignored for the sum.

    :return: compiled livesynthesis dictionary
    :rtype: dict
    """
    # The cumulated counters, all the schedulers counters are summed in here
    overall = {
        'hosts_total': 0,
        'hosts_not_monitored': 0,
        'hosts_up_hard': 0,
        'hosts_up_soft': 0,
        'hosts_down_hard': 0,
        'hosts_down_soft': 0,
        'hosts_unreachable_hard': 0,
        'hosts_unreachable_soft': 0,
        'hosts_acknowledged': 0,
        'hosts_in_downtime': 0,
        'hosts_flapping': 0,

        'services_total': 0,
        'services_not_monitored': 0,
        'services_ok_hard': 0,
        'services_ok_soft': 0,
        'services_warning_hard': 0,
        'services_warning_soft': 0,
        'services_critical_hard': 0,
        'services_critical_soft': 0,
        'services_unknown_hard': 0,
        'services_unknown_soft': 0,
        'services_unreachable_hard': 0,
        'services_unreachable_soft': 0,
        'services_acknowledged': 0,
        'services_in_downtime': 0,
        'services_flapping': 0,
    }

    res = self.get_id()
    res['livesynthesis'] = {
        '_overall': {
            '_freshness': int(time.time()),
            'livesynthesis': overall
        }
    }

    # Report our schedulers information, but only if a dispatcher exists
    dispatcher = getattr(self, 'dispatcher', None)
    if dispatcher is None:
        return res

    for satellite in dispatcher.all_daemons_links:
        # Only the active schedulers are concerned
        if satellite.type != 'scheduler' or not satellite.active:
            continue

        statistics = satellite.statistics
        if 'livesynthesis' not in statistics:
            continue

        # Scheduler detailed live synthesis
        scheduler_ls = statistics['livesynthesis']
        res['livesynthesis'][satellite.name] = {
            # Do not fail if the statistics were stored without a timestamp
            '_freshness': statistics.get('_freshness'),
            'livesynthesis': scheduler_ls
        }

        # Cumulated live synthesis: iterate the counters themselves, not the
        # `_overall` wrapper (whose keys are `_freshness` and `livesynthesis`),
        # else no counter ever matches and the sums remain null.
        for prop in overall:
            if prop in scheduler_ls:
                overall[prop] += scheduler_ls[prop]

    return res
1857+
17611858
def push_passive_check(self, details=False):
17621859
# pylint: disable=too-many-locals
17631860
"""Push the alignak overall state as a passive check

alignak/daemons/schedulerdaemon.py

+19
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,25 @@ def get_daemon_stats(self, details=False):
534534

535535
return res
536536

537+
def get_monitoring_problems(self):
    """Get the current scheduler live synthesis and problems

    Requests the detailed statistics from the scheduler and keeps only its
    `livesynthesis` and `problems` items, each one being reported only if it
    exists in the scheduler statistics.

    :return: live synthesis and problems dictionary, empty if no scheduler exists
    :rtype: dict
    """
    res = {}
    if not self.sched:
        return res

    # Get statistics from the scheduler
    scheduler_stats = self.sched.get_scheduler_stats(details=True)
    # Each item is guarded by its own key: the problems must not depend on
    # the live synthesis presence
    for item in ('livesynthesis', 'problems'):
        if item in scheduler_stats:
            res[item] = scheduler_stats[item]

    return res
555+
537556
def main(self):
538557
"""Main function for Scheduler, launch after the init::
539558

alignak/dispatcher.py

+10-15
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def check_reachable(self, forced=False, test=False):
204204
205205
If test parameter is True, do not really send but simulate only for testing purpose...
206206
207-
TODO: The update_infos function returns None when no ping has been executed
207+
The update_infos function returns None when no ping has been executed
208208
(too early...), or True / False according to the real ping and get managed
209209
configuration result. So, if the result is None, consider as not valid,
210210
else compute the global result...
@@ -246,7 +246,7 @@ def check_reachable(self, forced=False, test=False):
246246

247247
if result:
248248
# Got a managed configuration
249-
logger.debug("The %s %s manages %s",
249+
logger.debug("The %s '%s' manages %s",
250250
daemon_link.type, daemon_link.name, daemon_link.cfg_managed)
251251
if not self.first_dispatch_done:
252252
# I just (re)started the arbiter
@@ -259,8 +259,6 @@ def check_reachable(self, forced=False, test=False):
259259
daemon_link.type, daemon_link.name)
260260
# the daemon is not yet configured
261261
self.not_configured.append(daemon_link)
262-
# # Ask to wait for a new configuration
263-
# daemon_link.wait_new_conf()
264262
daemon_link.configuration_sent = False
265263
else:
266264
# Got a timeout !
@@ -288,16 +286,11 @@ def check_reachable(self, forced=False, test=False):
288286

289287
def check_status(self):
290288
# pylint: disable=too-many-branches
291-
"""Check all daemons state (reachable or not)
289+
"""Get all the daemons status
292290
293-
If test parameter is True, do not really send but simulate only for testing purpose...
294291
295-
TODO: The update_infos function returns None when no ping has been executed
296-
(too early...), or True / False according to the real ping and get managed
297-
configuration result. So, if the result is None, consider as not valid,
298-
else compute the global result...
299-
300-
:return: True if all daemons are reachable
292+
:return: Dictionary with all the daemons returned information
293+
:rtype: dict
301294
"""
302295
statistics = {}
303296
for daemon_link in self.all_daemons_links:
@@ -309,10 +302,12 @@ def check_status(self):
309302
# I exclude the daemons that are not active
310303
continue
311304

312-
result = None
313305
try:
314-
result = daemon_link.get_daemon_stats()
315-
statistics[daemon_link.name] = result
306+
daemon_link.statistics = daemon_link.get_daemon_stats(details=True)
307+
daemon_link.statistics['_freshness'] = int(time.time())
308+
statistics[daemon_link.name] = daemon_link.statistics
309+
logger.debug("Daemon %s statistics: %s" % (daemon_link.name,
310+
daemon_link.statistics))
316311
except LinkError:
317312
logger.warning("Daemon connection failed, I could not get statistics.")
318313

alignak/http/arbiter_interface.py

+150
Original file line numberDiff line numberDiff line change
@@ -372,3 +372,153 @@ def push_external_command(self, command=None):
372372
:return: None
373373
"""
374374
return self.command(command=command)
375+
376+
@cherrypy.expose
377+
@cherrypy.tools.json_out()
378+
def get_monitoring_problems(self):
379+
"""Get Alignak detailed monitoring status
380+
381+
This will return an object containing the properties of the `get_id`, plus a `problems`
382+
object which contains 2 properties for each known scheduler:
383+
- _freshness, which is the timestamp when the provided data were fetched
384+
- problems, which is an object with the scheduler known problems:
385+
386+
{
387+
...
388+
389+
"problems": {
390+
"scheduler-master": {
391+
"_freshness": 1528903945,
392+
"problems": {
393+
"fdfc986d-4ab4-4562-9d2f-4346832745e6": {
394+
"last_state": "CRITICAL",
395+
"service": "dummy_critical",
396+
"last_state_type": "SOFT",
397+
"last_state_update": 1528902442,
398+
"last_hard_state": "CRITICAL",
399+
"last_hard_state_change": 1528902442,
400+
"last_state_change": 1528902381,
401+
"state": "CRITICAL",
402+
"state_type": "HARD",
403+
"host": "host-all-8",
404+
"output": "Hi, checking host-all-8/dummy_critical -> exit=2"
405+
},
406+
"2445f2a3-2a3b-4b13-96ed-4cfb60790e7e": {
407+
"last_state": "WARNING",
408+
"service": "dummy_warning",
409+
"last_state_type": "SOFT",
410+
"last_state_update": 1528902463,
411+
"last_hard_state": "WARNING",
412+
"last_hard_state_change": 1528902463,
413+
"last_state_change": 1528902400,
414+
"state": "WARNING",
415+
"state_type": "HARD",
416+
"host": "host-all-6",
417+
"output": "Hi, checking host-all-6/dummy_warning -> exit=1"
418+
},
419+
...
420+
}
421+
}
422+
}
423+
}
424+
425+
:return: schedulers monitoring problems dictionary
426+
:rtype: dict
427+
"""
428+
res = self.get_id()
429+
res.update(self.get_start_time())
430+
res.update(self.app.get_monitoring_problems())
431+
return res
432+
433+
@cherrypy.expose
434+
@cherrypy.tools.json_out()
435+
def get_livesynthesis(self):
436+
"""Get Alignak live synthesis
437+
438+
This will return an object containing the properties of the `get_id`, plus a `livesynthesis`
439+
object which contains 2 properties for each known scheduler:
440+
- _freshness, which is the timestamp when the provided data were fetched
441+
- livesynthesis, which is an object with the scheduler live synthesis.
442+
443+
An `_overall` fake scheduler is also contained in the schedulers list to provide the
444+
cumulated live synthesis. Before sending the results, the arbiter sums up all its
445+
schedulers live synthesis counters in the `_overall` live synthesis.
446+
447+
{
448+
...
449+
450+
"livesynthesis": {
451+
"_overall": {
452+
"_freshness": 1528947526,
453+
"livesynthesis": {
454+
"hosts_total": 11,
455+
"hosts_not_monitored": 0,
456+
"hosts_up_hard": 11,
457+
"hosts_up_soft": 0,
458+
"hosts_down_hard": 0,
459+
"hosts_down_soft": 0,
460+
"hosts_unreachable_hard": 0,
461+
"hosts_unreachable_soft": 0,
462+
"hosts_flapping": 0,
463+
"hosts_acknowledged": 0,
464+
"hosts_in_downtime": 0,
465+
"services_total": 100,
466+
"services_not_monitored": 0,
467+
"services_ok_hard": 70,
468+
"services_ok_soft": 0,
469+
"services_warning_hard": 4,
470+
"services_warning_soft": 6,
471+
"services_critical_hard": 6,
472+
"services_critical_soft": 4,
473+
"services_unknown_hard": 3,
474+
"services_unknown_soft": 7,
475+
"services_unreachable_hard": 0,
476+
"services_unreachable_soft": 0,
477+
"services_flapping": 0,
478+
"services_acknowledged": 0,
479+
"services_in_downtime": 0
480+
}
481+
}
482+
},
483+
"scheduler-master": {
484+
"_freshness": 1528947522,
485+
"livesynthesis": {
486+
"hosts_total": 11,
487+
"hosts_not_monitored": 0,
488+
"hosts_up_hard": 11,
489+
"hosts_up_soft": 0,
490+
"hosts_down_hard": 0,
491+
"hosts_down_soft": 0,
492+
"hosts_unreachable_hard": 0,
493+
"hosts_unreachable_soft": 0,
494+
"hosts_flapping": 0,
495+
"hosts_acknowledged": 0,
496+
"hosts_in_downtime": 0,
497+
"services_total": 100,
498+
"services_not_monitored": 0,
499+
"services_ok_hard": 70,
500+
"services_ok_soft": 0,
501+
"services_warning_hard": 4,
502+
"services_warning_soft": 6,
503+
"services_critical_hard": 6,
504+
"services_critical_soft": 4,
505+
"services_unknown_hard": 3,
506+
"services_unknown_soft": 7,
507+
"services_unreachable_hard": 0,
508+
"services_unreachable_soft": 0,
509+
"services_flapping": 0,
510+
"services_acknowledged": 0,
511+
"services_in_downtime": 0
512+
}
513+
}
514+
}
515+
}
516+
}
517+
518+
:return: scheduler live synthesis
519+
:rtype: dict
520+
"""
521+
res = self.get_id()
522+
res.update(self.get_start_time())
523+
res.update(self.app.get_livesynthesis())
524+
return res

alignak/http/generic_interface.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def api(self):
6565
:rtype: dict
6666
"""
6767
functions = [x[0]for x in inspect.getmembers(self, predicate=inspect.ismethod)
68-
if not x[0].startswith('__')]
68+
if not x[0].startswith('_')]
6969

7070
full_api = {
7171
'doc': u"When posting data you have to use the JSON format.",

0 commit comments

Comments
 (0)