Notification policy

This commit is contained in:
Daniele Verducci (Slimpenguin) 2022-04-15 08:23:40 +02:00
parent e24383728a
commit cd146018de
2 changed files with 130 additions and 17 deletions

View File

@ -1,3 +1,5 @@
# The DEFAULT section contains the global configuration applied to all checks.
# You can redefine these variables in a check to override the global ones.
[DEFAULT] [DEFAULT]
#### EMAIL NOTIFICATIONS #### #### EMAIL NOTIFICATIONS ####
@ -39,6 +41,21 @@ MAILTO=root@localhost, user@localhost
#ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name #ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
#### NOTIFICATION POLICY ###
# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding.
# Possible values:
# EVERY_RUN In case of alarm, sends a mail every time the script is run
# START Sends a mail only when an alarm starts
# ONCE_IN_MINUTES In case of alarm, resends a mail only if NOTIFY_MINUTES minutes have passed since the previous one
NOTIFY=EVERY_RUN
# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES minutes have passed since the previous one
NOTIFY_MINUTES=60
# Sends a mail when the alarm has ended
NOTIFY_ALARM_END=TRUE
#### HEALTH CHECKS #### #### HEALTH CHECKS ####
# Every health check is based on a command being executed, its result being parsed with a regexp # Every health check is based on a command being executed, its result being parsed with a regexp
# to extract (as a single group) the numeric or string value, and the value being compared with # to extract (as a single group) the numeric or string value, and the value being compared with
@ -190,3 +207,10 @@ COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline"
DISABLED=True DISABLED=True
ALARM_STRING_NOT_EQUAL=Online ALARM_STRING_NOT_EQUAL=Online
COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline" COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline"
[dummy_always_alarm]
# A dummy check that is always in alarm. Useful for testing notifications.
DISABLED=True
ALARM_STRING_NOT_EQUAL=Core meltdown!
COMMAND=echo "Core meltdown!"

View File

@ -46,13 +46,18 @@ import socket
import getpass import getpass
import re import re
import locale import locale
import json
NAME = 'healthcheck' NAME = 'healthcheck'
VERSION = '0.1' VERSION = '0.1'
DESCRIPTION = 'A simple server monitoring software' DESCRIPTION = 'A simple server monitoring software'
EMAIL_SUBJECT_TPL = 'Host {} failed health check for {}' EMAIL_START_SUBJECT_TPL = '{}: {} health alarm!'
EMAIL_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}' EMAIL_START_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'
EMAIL_END_SUBJECT_TPL = '{}: {} OK'
EMAIL_END_MESSAGE_TPL = 'Alarm ceased for sensor {} on host {} on {}'
# Healthcheck saves the current status (alarms triggered, last run...) in this file
STATUS_FILE = '/tmp/healthcheck.tmp'
class Main: class Main:
@ -78,6 +83,10 @@ class Main:
def run(self, dryRun): def run(self, dryRun):
''' Runs the health checks ''' ''' Runs the health checks '''
# Load status
status = Status()
# Run checks based on the config
for section in self.config: for section in self.config:
if section == 'DEFAULT': if section == 'DEFAULT':
continue continue
@ -85,6 +94,7 @@ class Main:
s = Settings(section, self.config) s = Settings(section, self.config)
if s.disabled: if s.disabled:
self._log.info('Ignoring disabled check "{}"'.format(section)) self._log.info('Ignoring disabled check "{}"'.format(section))
status.unsetAlarm(section)
continue continue
self._log.info('Checking "{}"'.format(section)) self._log.info('Checking "{}"'.format(section))
@ -93,11 +103,37 @@ class Main:
if error: if error:
# Alarm! # Alarm!
logging.warning('Alarm for {}: {}!'.format(section, error)) logging.warning('Alarm for {}: {}!'.format(section, error))
if not dryRun: if self.shouldNotify(section, s, status):
if s.mailto: status.setAlarm(section)
self.sendMail(s, error) if not dryRun:
if s.alarmCommand: if s.mailto:
self.executeAlarmCommand(s, error) self.sendAlmStartMail(s, error)
if s.alarmCommand:
self.executeAlarmCommand(s, error)
elif status.getAlarmTriggeredTimestamp(section) is not None:
logging.info('Alarm ceased for {}: OK!'.format(section))
if s.notify_alarm_end:
self.sendAlmEndMail(s)
status.unsetAlarm(section)
# Save updated status
status.save()
def shouldNotify(self, section, settings, status):
    '''
    Decides whether a notification has to be issued for a check in alarm.

    section:  name of the check, used as key to look up the stored timestamp
    settings: object exposing `notify` and `notify_minutes` for the check
    status:   object exposing `getAlarmTriggeredTimestamp(name)`

    Returns True when no timestamp is stored for the check, when the
    policy is EVERY_RUN, or when the policy is ONCE_IN_MINUTES and more
    than `notify_minutes` minutes separate now from the stored timestamp.
    Returns False for every other case.
    '''
    storedTimestamp = status.getAlarmTriggeredTimestamp(section)

    # No stored timestamp means the alarm just started; EVERY_RUN always notifies
    if storedTimestamp is None or settings.notify == 'EVERY_RUN':
        return True

    # Any policy other than ONCE_IN_MINUTES never re-notifies a known alarm
    if settings.notify != 'ONCE_IN_MINUTES':
        return False

    # Re-notify only once the configured interval (in minutes) has elapsed
    elapsedSeconds = time.time() - storedTimestamp
    return elapsedSeconds > (settings.notify_minutes * 60)
# Calls the provided command, checks the value parsing it with the provided regexp # Calls the provided command, checks the value parsing it with the provided regexp
# and returns an error string, or null if the value is within its limits # and returns an error string, or null if the value is within its limits
@ -146,15 +182,34 @@ class Main:
if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than): if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than):
return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than) return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than)
def sendMail(self, s, error): def sendAlmStartMail(self, s, error):
subject = EMAIL_START_SUBJECT_TPL.format(self.hostname, s.name)
body = EMAIL_START_MESSAGE_TPL.format(
s.name,
self.hostname,
time.strftime("%a, %d %b %Y %H:%M:%S"),
error
)
self.sendMail(s, subject, body)
def sendAlmEndMail(self, s):
    '''
    Builds the "alarm ceased" mail for check `s` and hands it to sendMail.

    The subject is formatted with the host name and the check name, the
    body with the check name, the host name and the current local time.
    '''
    timestamp = time.strftime("%a, %d %b %Y %H:%M:%S")
    self.sendMail(
        s,
        EMAIL_END_SUBJECT_TPL.format(self.hostname, s.name),
        EMAIL_END_MESSAGE_TPL.format(s.name, self.hostname, timestamp),
    )
def sendMail(self, s, subject, body):
if s.smtphost: if s.smtphost:
logging.info("Sending alarm email to %s via %s", s.mailto, s.smtphost) logging.info("Sending email to %s via %s", s.mailto, s.smtphost)
else: else:
logging.info("Sending alarm email to %s using local smtp", s.mailto) logging.info("Sending email to %s using local smtp", s.mailto)
# Create main message # Create main message
msg = MIMEMultipart() msg = MIMEMultipart()
msg['Subject'] = EMAIL_SUBJECT_TPL.format(self.hostname, s.name) msg['Subject'] = subject
if s.mailfrom: if s.mailfrom:
m_from = s.mailfrom m_from = s.mailfrom
else: else:
@ -164,12 +219,6 @@ class Main:
msg.preamble = 'This is a multi-part message in MIME format.' msg.preamble = 'This is a multi-part message in MIME format.'
# Add base text # Add base text
body = EMAIL_MESSAGE_TPL.format(
s.name,
self.hostname,
time.strftime("%a, %d %b %Y %H:%M:%S"),
error
)
txt = MIMEText(body) txt = MIMEText(body)
msg.attach(txt) msg.attach(txt)
@ -207,6 +256,34 @@ class Main:
self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode)) self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode))
class Status:
    '''
    Represents the current status (alarms triggered, last run...)

    The status is a dict with two keys:
        lastRun: unix time in seconds, refreshed by save()
        alarms:  key-value, alarmName : alarmTriggeredTimestamp
    It is loaded from STATUS_FILE on construction and written back by save().
    '''

    def __init__(self):
        ''' Loads the status from STATUS_FILE, or starts with an empty one '''
        self.status = self._load()

    @staticmethod
    def _emptyStatus():
        ''' Returns a fresh status with no alarms recorded '''
        return {
            'lastRun': 0,  # unix time in seconds
            'alarms': {},  # key-value, alarmName : alarmTriggeredTimestamp
        }

    @staticmethod
    def _load():
        '''
        Reads and validates STATUS_FILE.

        A missing file is the normal first-run case. An unreadable or
        malformed file (e.g. truncated by an interrupted write) is logged
        and discarded instead of aborting every following run.
        '''
        try:
            with open(STATUS_FILE, 'r') as openfile:
                loaded = json.load(openfile)
        except FileNotFoundError:
            return Status._emptyStatus()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
            logging.warning('Status file %s is corrupted: starting with an empty status', STATUS_FILE)
            return Status._emptyStatus()

        # Valid JSON is not enough: the other methods index status['alarms'] as a dict
        if not isinstance(loaded, dict) or not isinstance(loaded.get('alarms'), dict):
            logging.warning('Status file %s has an unexpected structure: starting with an empty status', STATUS_FILE)
            return Status._emptyStatus()

        loaded.setdefault('lastRun', 0)
        return loaded

    def save(self):
        ''' Updates the last run time and writes the status to STATUS_FILE '''
        self.status['lastRun'] = time.time()
        jo = json.dumps(self.status)
        with open(STATUS_FILE, "w") as outfile:
            outfile.write(jo)

    def setAlarm(self, almName):
        ''' Records the current time as the triggered timestamp of the alarm '''
        self.status['alarms'][almName] = time.time()

    def unsetAlarm(self, almName):
        ''' Forgets the alarm; does nothing if it was not recorded '''
        self.status['alarms'].pop(almName, None)

    def getAlarmTriggeredTimestamp(self, almName):
        ''' Returns the recorded timestamp of the alarm, or None if not recorded '''
        return self.status['alarms'].get(almName, None)
class Settings: class Settings:
''' Represents settings for a check ''' ''' Represents settings for a check '''
@ -244,6 +321,10 @@ class Settings:
self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None) self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None)
self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None) self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None)
self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None) self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None)
## Notification policy
self.notify = self.getEnum(name, 'NOTIFY', 'EVERY_RUN', ['EVERY_RUN', 'START', 'ONCE_IN_MINUTES'])
self.notify_minutes = self.getInt(name, 'NOTIFY_MINUTES', 0)
self.notify_alarm_end = self.getBoolean(name, 'NOTIFY_ALARM_END', True)
## Command to obtain the value for comparation ## Command to obtain the value for comparation
self.command = self.getStr(name, 'COMMAND', None) self.command = self.getStr(name, 'COMMAND', None)
## Regexp to extract value from command output (default to match full string) ## Regexp to extract value from command output (default to match full string)
@ -255,12 +336,20 @@ class Settings:
except configparser.NoOptionError: except configparser.NoOptionError:
return defaultValue return defaultValue
def getInt(self, name, key, defaultValue):
    '''
    Returns the integer value of option `key` for check `name`.

    The raw value is obtained through getStr (which receives
    `defaultValue`) and then converted with int().

    Raises ValueError, naming the offending option, when the value is not
    a valid integer.
    '''
    raw = self.getStr(name, key, defaultValue)
    try:
        return int(raw)
    except ValueError as err:
        # Same message style as getEnum, so a bad config points at the option
        raise ValueError("Invalid value {} for configuration {}: expected an integer".format(raw, key)) from err
def getBoolean(self, name, key, defaultValue): def getBoolean(self, name, key, defaultValue):
try: try:
return self.config.getboolean(name, key) return self.config.getboolean(name, key)
except configparser.NoOptionError: except configparser.NoOptionError:
return defaultValue return defaultValue
def getEnum(self, name, key, defaultValue, values):
    '''
    Returns option `key` for check `name`, restricted to a set of values.

    The value is obtained through getStr (which receives `defaultValue`).
    Raises ValueError when it is not one of `values`.
    '''
    chosen = self.getStr(name, key, defaultValue)
    if chosen in values:
        return chosen

    allowed = ', '.join(values)
    raise ValueError("Invalid value {} for configuration {}: expected one of {}".format(chosen, key, allowed))
if __name__ == '__main__': if __name__ == '__main__':