diff --git a/healthcheck/healthcheck.cfg.example b/healthcheck/healthcheck.cfg.example index 3c76d29..6df75d4 100644 --- a/healthcheck/healthcheck.cfg.example +++ b/healthcheck/healthcheck.cfg.example @@ -1,3 +1,5 @@ +# The DEFAULT section contains the global configuration applied to all checks. +# You can redefine these variables in a check to override the global ones. [DEFAULT] #### EMAIL NOTIFICATIONS #### @@ -39,6 +41,21 @@ MAILTO=root@localhost, user@localhost #ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name +#### NOTIFICATION POLICY #### +# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding. +# Possible values: +# EVERY_RUN In case of alarm, sends a mail every time the script is run +# START Sends a mail only when an alarm starts +# ONCE_IN_MINUTES In case of alarm, resends a mail only if NOTIFY_MINUTES minutes have passed +NOTIFY=EVERY_RUN + +# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES minutes have passed since the previous one +NOTIFY_MINUTES=60 + +# Sends a mail when the alarm has ended +NOTIFY_ALARM_END=TRUE + + #### HEALTH CHECKS #### # Every health check is based on a command being executed, its result being parsed with a regexp # to extract (as a single group) the numeric or string value, and the value being compared with @@ -190,3 +207,10 @@ COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline" DISABLED=True ALARM_STRING_NOT_EQUAL=Online COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline" + + +[dummy_always_alarm] +# A dummy check that is always in alarm. Useful for testing notifications. +DISABLED=True +ALARM_STRING_NOT_EQUAL=Core meltdown! +COMMAND=echo "Core meltdown!" 
diff --git a/healthcheck/healthcheck.py b/healthcheck/healthcheck.py index 05a88c9..2e872df 100755 --- a/healthcheck/healthcheck.py +++ b/healthcheck/healthcheck.py @@ -46,13 +46,18 @@ import socket import getpass import re import locale +import json NAME = 'healthcheck' VERSION = '0.1' DESCRIPTION = 'A simple server monitoring software' -EMAIL_SUBJECT_TPL = 'Host {} failed health check for {}' -EMAIL_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}' +EMAIL_START_SUBJECT_TPL = '{}: {} health alarm!' +EMAIL_START_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}' +EMAIL_END_SUBJECT_TPL = '{}: {} OK' +EMAIL_END_MESSAGE_TPL = 'Alarm ceased for sensor {} on host {} on {}' +# Healthcheck saves the current status (alarms triggered, last run...) in this file +STATUS_FILE = '/tmp/healthcheck.tmp' class Main: @@ -78,6 +83,10 @@ class Main: def run(self, dryRun): ''' Runs the health checks ''' + # Load status + status = Status() + + # Run checks based on the config for section in self.config: if section == 'DEFAULT': continue @@ -85,6 +94,7 @@ class Main: s = Settings(section, self.config) if s.disabled: self._log.info('Ignoring disabled check "{}"'.format(section)) + status.unsetAlarm(section) continue self._log.info('Checking "{}"'.format(section)) @@ -93,11 +103,37 @@ class Main: if error: # Alarm! 
logging.warning('Alarm for {}: {}!'.format(section, error)) - if not dryRun: - if s.mailto: - self.sendMail(s, error) - if s.alarmCommand: - self.executeAlarmCommand(s, error) + if self.shouldNotify(section, s, status): + status.setAlarm(section) + if not dryRun: + if s.mailto: + self.sendAlmStartMail(s, error) + if s.alarmCommand: + self.executeAlarmCommand(s, error) + elif status.getAlarmTriggeredTimestamp(section) is not None: + logging.info('Alarm ceased for {}: OK!'.format(section)) + if s.notify_alarm_end and s.mailto and not dryRun: + self.sendAlmEndMail(s) + status.unsetAlarm(section) + + # Save updated status + status.save() + + def shouldNotify(self, section, settings, status): + almTriggeredTime = status.getAlarmTriggeredTimestamp(section) + # Notify if alarm just started + if almTriggeredTime is None: + return True + + # Notify if NOTIFY=EVERY_RUN + if settings.notify == 'EVERY_RUN': + return True + + # Notify again once NOTIFY_MINUTES have elapsed since the last notification + if settings.notify == 'ONCE_IN_MINUTES' and (time.time() - almTriggeredTime) > (settings.notify_minutes * 60): + return True + + return False # Calls the provided command, checks the value parsing it with the provided regexp # and returns an error string, or null if the value is within its limits @@ -146,15 +182,34 @@ class Main: if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than): return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than) - def sendMail(self, s, error): + def sendAlmStartMail(self, s, error): + subject = EMAIL_START_SUBJECT_TPL.format(self.hostname, s.name) + body = EMAIL_START_MESSAGE_TPL.format( + s.name, + self.hostname, + time.strftime("%a, %d %b %Y %H:%M:%S"), + error + ) + self.sendMail(s, subject, body) + + def sendAlmEndMail(self, s): + subject = EMAIL_END_SUBJECT_TPL.format(self.hostname, s.name) + body = EMAIL_END_MESSAGE_TPL.format( + s.name, + self.hostname, + time.strftime("%a, %d %b %Y %H:%M:%S") + ) + self.sendMail(s, 
subject, body) + + def sendMail(self, s, subject, body): if s.smtphost: - logging.info("Sending alarm email to %s via %s", s.mailto, s.smtphost) + logging.info("Sending email to %s via %s", s.mailto, s.smtphost) else: - logging.info("Sending alarm email to %s using local smtp", s.mailto) + logging.info("Sending email to %s using local smtp", s.mailto) # Create main message msg = MIMEMultipart() - msg['Subject'] = EMAIL_SUBJECT_TPL.format(self.hostname, s.name) + msg['Subject'] = subject if s.mailfrom: m_from = s.mailfrom else: @@ -164,12 +219,6 @@ class Main: msg.preamble = 'This is a multi-part message in MIME format.' # Add base text - body = EMAIL_MESSAGE_TPL.format( - s.name, - self.hostname, - time.strftime("%a, %d %b %Y %H:%M:%S"), - error - ) txt = MIMEText(body) msg.attach(txt) @@ -207,6 +256,34 @@ class Main: self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode)) +class Status: + ''' Represents the current status (alarms triggered, last run...) 
''' + + def __init__(self): + try: + with open(STATUS_FILE, 'r') as openfile: + self.status = json.load(openfile) + except (FileNotFoundError, json.JSONDecodeError): + self.status = { + 'lastRun': 0, # unix time in seconds + 'alarms': {}, # key-value, alarmName : alarmTriggeredTimestamp + } + + def save(self): + self.status['lastRun'] = time.time() + jo = json.dumps(self.status) + with open(STATUS_FILE, "w") as outfile: + outfile.write(jo) + + def setAlarm(self, almName): + self.status['alarms'][almName] = time.time() + + def unsetAlarm(self, almName): + self.status['alarms'].pop(almName, None) + + def getAlarmTriggeredTimestamp(self, almName): + return self.status['alarms'].get(almName, None) + class Settings: ''' Represents settings for a check ''' @@ -244,6 +321,10 @@ class Settings: self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None) self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None) self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None) + ## Notification policy + self.notify = self.getEnum(name, 'NOTIFY', 'EVERY_RUN', ['EVERY_RUN', 'START', 'ONCE_IN_MINUTES']) + self.notify_minutes = self.getInt(name, 'NOTIFY_MINUTES', 0) + self.notify_alarm_end = self.getBoolean(name, 'NOTIFY_ALARM_END', True) ## Command to obtain the value for comparation self.command = self.getStr(name, 'COMMAND', None) ## Regexp to extract value from command output (default to match full string) @@ -255,12 +336,20 @@ class Settings: except configparser.NoOptionError: return defaultValue + def getInt(self, name, key, defaultValue): + return int(self.getStr(name, key, defaultValue)) + def getBoolean(self, name, key, defaultValue): try: return self.config.getboolean(name, key) except configparser.NoOptionError: return defaultValue + def getEnum(self, name, key, defaultValue, values): + val = self.getStr(name, key, defaultValue) + if val not in values: + raise ValueError("Invalid value {} for configuration {}: expected one of {}".format(val, 
key, ', '.join(values))) + return val if __name__ == '__main__':