From 8c786d61906f0e644dee21835b4016d550c01f92 Mon Sep 17 00:00:00 2001 From: Daniele Date: Fri, 1 Apr 2022 00:02:23 +0200 Subject: [PATCH] First working version --- .gitignore | 1 + README.md | 57 +++++++++ healthcheck.cfg.example | 121 ++++++++++++++++++ healthcheck.cron.example | 6 + healthcheck.py | 263 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 448 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 healthcheck.cfg.example create mode 100644 healthcheck.cron.example create mode 100755 healthcheck.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3ae9683 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +healthcheck.cfg diff --git a/README.md b/README.md new file mode 100644 index 0000000..c23f58a --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +# Selfhost utilities +A collection of utilities for self hosters. + +## HEALTHCHECK +A simple server health check. +Sends an email in case of alarm. +Meant to be run with a cron (see healthcheck.cron.example) +Tested on Debian 11, but should run on almost any standard linux box + +### Alarms +Provided ready-to-use alarms in config file: +- system load +- disk space +- raid status +- battery level / charger status (for laptops used as servers, apparently common among the self hosters) +- memory status +- cpu temperature (needs to be adapted as every system has a different name for the sensor) +- fan speed (needs to be adapted as every system has a different name for the sensor) + +### How does it work +The config file contains a list of checks. The most common checks are provided in the config file, but it is possible to configure custom checks, if needed. 
+Every check definition has: +- DISABLED: boolean, whether to skip the check +- ALARM_VALUE_MORE_THAN: float, the alarm is issued if detected value exceeds the configured one +- ALARM_VALUE_LESS_THAN: float, the alarm is issued if detected value is less than the configured one +- ALARM_VALUE_EQUAL: float, the alarm is issued if detected value is equal to the configured one (the values are always compared as floats) +- ALARM_VALUE_NOT_EQUAL: float, the alarm is issued if detected value is not equal to the configured one (the values are always compared as floats) +- ALARM_STRING_EQUAL: string, the alarm is issued if detected value is equal to the configured one (the values are always compared as strings) +- ALARM_STRING_NOT_EQUAL: string, the alarm is issued if detected value is not equal to the configured one (the values are always compared as strings) +- COMMAND: the command to run to obtain the value +- REGEXP: a regular expression that will be executed on the command output and returns a single group that will be compared with ALARM_*. If omitted, the complete command output will be used for comparison. + +### Installation +Copy the script and the config file into the system to check: +``` +cp healthcheck.py /usr/local/bin/healthcheck.py +cp healthcheck.cfg.example /usr/local/etc/healthcheck.cfg +``` +Edit `/usr/local/etc/healthcheck.cfg` enabling the checks you need and configuring email settings. +Run `/usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg` to check it is working. If needed, change the config to make a check fail and see if the notification mail is delivered. +Now copy the cron file: +``` +cp healthcheck.cron.example /etc/cron.d/healthcheck +``` +For increased safety, edit the cron file placing your email address in MAILTO var to be notified in case of healthcheck.py catastrophic failure. + +Setup is now complete: the cron runs the script every minute and you will receive emails in case of failed checks. 
+ +### Useful notes +#### Note on system load averages: +As stated in the `uptime` command manual: +> System load averages is the average number of processes that are either in a runnable or uninterruptable state. A process in a runnable state is either using the CPU or waiting to use the CPU. A process in uninterruptable state is waiting for some I/O access, eg waiting for disk. The averages are taken over the three time intervals. Load averages are not normalized for the number of CPUs in a system, so a load average of 1 means a single CPU system is loaded all the time while on a 4 CPU system it means it was idle 75% of the time. + +#### Note on temperature and fan speed checks: +The check to run needs lm-sensors to be installed and configured. Check your distribution install guide. +The sensors have different names in every system, so you WILL need to adapt the configuration. + \ No newline at end of file diff --git a/healthcheck.cfg.example b/healthcheck.cfg.example new file mode 100644 index 0000000..2e880ab --- /dev/null +++ b/healthcheck.cfg.example @@ -0,0 +1,121 @@ +[DEFAULT] + +#### EMAIL NOTIFICATIONS + +# Notify this email address(es) in case of alarm, multiple addresses separated by commas +MAILTO=root@localhost, user@localhost + +# Sender address +MAILFROM=root@localhost + +# Use a remote SMTP host (enable by removing comment) +#SMTPHOST=my.smtp.host:465 + +# SMTP credentials +#SMTPUSER=mysmtpuser +#SMTPPASS=mysmtppass + +# Use SSL for SMTP +#SMTPSSL=True + + + +#### HEALTH CHECKS #### +# Every health check is based on a command being executed, its result being parsed with a regexp +# to extract (as a single group) the numeric or string value, and the value being compared with +# a configured value. These checks are ready to be used, just enable the ones you need. 
+# You can add your own custom check declaring another section like this: +# +# [my_custom_check_name] +# DISABLED=False +# ALARM_STRING_EQUAL=Lorem ipsum +# ALARM_STRING_NOT_EQUAL=The lazy fox +# ALARM_VALUE_EQUAL=99 +# ALARM_VALUE_NOT_EQUAL=76.365338 +# ALARM_VALUE_MORE_THAN=1.0 +# ALARM_VALUE_LESS_THAN=12 +# COMMAND=/my/custom/binary --with parameters +# REGEXP=my regex to parse (awesome|disappointing) command output + +[system_load_1min] +# The system load average in the last minute +DISABLED=True +ALARM_VALUE_MORE_THAN=1.0 +COMMAND=uptime +REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+ + +[system_load_5min] +# The system load average in the last 5 minutes +DISABLED=True +ALARM_VALUE_MORE_THAN=1.0 +COMMAND=uptime +REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+ + +[system_load_15min] +# The system load average in the last 15 minutes +DISABLED=True +ALARM_VALUE_MORE_THAN=1.0 +COMMAND=uptime +REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+) + +[used_disk_space] +# Used disk space (in percent, i.e. ALARM_VALUE_MORE_THAN=75 -> alarm if disk is more than 75% full) +DISABLED=True +ALARM_VALUE_MORE_THAN=75 +COMMAND=df -h /dev/sda1 +REGEXP=(\d{1,3})% + +[raid_status] +# Issues an alarm when the raid is corrupted +# Checks this part of the /proc/mdstat file: +# 243553280 blocks super 1.2 [2/2] [UU] +# If the content of the last [ ] contains only U (without _), the raid array is healty +# Otherwise, [U_] or [_U] is displayed (may contain more U or _ if the array is more disks) +DISABLED=True +ALARM_STRING_NOT_EQUAL=UU +COMMAND=cat /proc/mdstat +REGEXP=.*\] \[([U_]+)\]\n + +[battery_level] +# Issues an alarm when battery is discharging below a certain level (long blackout, pulled power cord...) +# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed. 
+# Value is in % +DISABLED=True +COMMAND=acpi -b +REGEXP=Battery \d: .*, (\d{1,3})% +ALARM_VALUE_LESS_THAN=90 + +[laptop_charger_disconnected] +# Issues an alarm when laptop charger is disconnected +# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed. +DISABLED=True +COMMAND=acpi -a +REGEXP=Adapter \d: (.+) +ALARM_STRING_NOT_EQUAL=on-line + +[free_ram] +# Free ram in % +# Shows another approach: does all the computation in the command and picks up +# all the output (by not declaring a regexp). +DISABLED=True +COMMAND=free | grep Mem | awk '{print int($4/$2 * 100.0)}' +ALARM_VALUE_LESS_THAN=20 + +[cpu_temperature] +# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide) +# The regexp must be adapted to your configuration: run `sensors` in the command line +# to find the name of the temperature sensor in your system. In this case is `Core 0`, +# but may be called Tdie or a lot of different names, there is no standard. +DISABLED=True +ALARM_VALUE_MORE_THAN=80 +COMMAND=sensors +REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF] + +[fan_speed] +# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide) +# The regexp must be adapted to your configuration: run `sensors` in the command line +# to find the name of the fan speed sensor in your system. +DISABLED=True +ALARM_VALUE_LESS_THAN=300 +COMMAND=sensors +REGEXP=cpu_fan: +(\d) RPM diff --git a/healthcheck.cron.example b/healthcheck.cron.example new file mode 100644 index 0000000..5468ab4 --- /dev/null +++ b/healthcheck.cron.example @@ -0,0 +1,6 @@ +# Cron to execute health checks +# As a security measure, the address in MAILTO will be notified +# if the healthcheck.py script crashes. 
+ +MAILTO="your-email-address" +* * * * * root /usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg -q diff --git a/healthcheck.py b/healthcheck.py new file mode 100755 index 0000000..50330ce --- /dev/null +++ b/healthcheck.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +""" @package docstring +Self host healthcheck + +A simple server monitoring software: sends an email in case of alarm. + +Installation: +- Copy healthcheck.cfg in /usr/local/etc/healthcheck.cfg and customize it +- Copy healthcheck.py in /usr/local/bin/healthcheck.py + +Usage: +Place a cron entry like this one: + +* * * * * root python3 /usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg + +The script will print current values and issue an alarm sending a mail if any of the values +exceeds the limit configured in healthcheck.cfg + +@author Daniele Verducci + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +You should have received a copy of the GNU General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. 
import os
import sys
import logging
import traceback
import subprocess
import configparser
import time
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.mime.text import MIMEText
import smtplib
import socket
import getpass
import re
import locale


NAME = 'healthcheck'
VERSION = '0.1'
DESCRIPTION = 'A simple server monitoring software'
EMAIL_SUBJECT_TPL = 'Host {} failed health check for {}'
EMAIL_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'


class Main:
    ''' Runs every enabled check of a config file and mails an alarm for each failed one '''

    def __init__(self, configPath):
        '''
        Sets up the locale (needed for parsing numbers) and reads the config.

        configPath: path of the configuration file.
        Raises ValueError if configPath is not an existing regular file and
        locale.Error if the locale requested by the environment is not installed.
        '''
        # An empty locale name selects the user's default settings from the
        # environment (LC_ALL, LC_NUMERIC, LANG...), the same variables the
        # spawned commands use to format their numbers. Unlike reading $LANG
        # by hand, this also works when no locale variable is set at all
        # (e.g. in a minimal cron environment): the "C" locale is used.
        locale.setlocale(locale.LC_ALL, '')

        self._log = logging.getLogger('main')

        # isfile() is False for missing paths too, no separate exists() needed
        if not os.path.isfile(configPath):
            raise ValueError('configPath must be a file')

        # Interpolation is disabled because the regexps may contain "%"
        self.config = configparser.ConfigParser(interpolation=None)
        self.config.read(configPath)

    def run(self, dryRun):
        '''
        Runs the health checks.

        dryRun: if true, alarms are only logged and no mail is sent.
        '''
        # sections() never includes the DEFAULT section, which only holds
        # the values shared by all the checks
        for section in self.config.sections():
            s = Settings(section, self.config)
            if s.disabled:
                self._log.info('Ignoring disabled check "%s"', section)
                continue

            self._log.info('Checking "%s"', section)

            error = self.check(s)
            if error:
                # Alarm!
                self._log.warning('Alarm for %s: %s!', section, error)
                if not dryRun:
                    self.sendMail(s, error)

    def check(self, config):
        '''
        Calls the configured command, extracts the value from its output with
        the configured regexp and compares it with the configured alarm values.

        config: the settings of the check; the attributes read are command,
                regexp, alarm_string_* and alarm_value_*.
        Returns an error string describing the alarm, or None if the value is
        within its limits. Problems with the configuration, the command or the
        detected value are reported as error strings too, so that a single
        broken check does not prevent the remaining ones from running.
        '''
        # Check config
        if not config.command:
            return "bad config: COMMAND is mandatory"
        if not config.regexp:
            return "bad config: REGEXP is mandatory"

        # Run command. A shell is needed because the configured commands may
        # contain pipes; the command line comes from the config file only.
        ret = subprocess.run(config.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        # errors='replace': unexpected bytes in the output must not crash the check
        stdout = ret.stdout.decode(errors='replace') if ret.stdout else ""
        if ret.stderr:
            self._log.info('%s subprocess stderr:\n%s', config.command, ret.stderr.decode(errors='replace'))
        if stdout:
            self._log.debug('%s subprocess stdout:\n%s', config.command, stdout)
        if ret.returncode != 0:
            return 'subprocess {} exited with error code {}'.format(config.command, ret.returncode)

        # Parse result with regex
        try:
            match = re.search(config.regexp, stdout, re.MULTILINE)
        except re.error as e:
            return 'bad config: invalid REGEXP ({})'.format(e)
        if not match:
            return 'regexp didn\'t match anything'
        groups = match.groups()
        if len(groups) != 1:
            return 'regexp returns {} groups (expected exactly 1 group)'.format(len(groups))
        detectedValue = groups[0]
        if detectedValue is None:
            # The only group is optional and did not take part in the match
            return 'regexp didn\'t match anything'

        self._log.info('detected %s', detectedValue)

        # Compare detected value as string
        if config.alarm_string_equal and (detectedValue == config.alarm_string_equal):
            return 'value is "{}"'.format(detectedValue)
        if config.alarm_string_not_equal and (detectedValue != config.alarm_string_not_equal):
            return 'value is "{}", but should be "{}"'.format(detectedValue, config.alarm_string_not_equal)

        # Compare detected value as number, only if a numeric limit is
        # configured: a string-only check may detect a non numeric value
        limits = (
            config.alarm_value_equal,
            config.alarm_value_not_equal,
            config.alarm_value_more_than,
            config.alarm_value_less_than,
        )
        if not any(limits):
            return None
        try:
            # The detected value is formatted by the command according to the
            # system locale, the limits are always written with the dot
            value = locale.atof(detectedValue)
        except ValueError:
            return 'value "{}" is not a number'.format(detectedValue)
        try:
            equal, notEqual, moreThan, lessThan = [float(x) if x else None for x in limits]
        except ValueError as e:
            return 'bad config: {}'.format(e)

        if equal is not None and value == equal:
            return 'value is {}'.format(detectedValue)
        if notEqual is not None and value != notEqual:
            return 'value is {}, but should be {}'.format(detectedValue, config.alarm_value_not_equal)
        if moreThan is not None and value > moreThan:
            return 'value is {}, but should not exceed {}'.format(value, config.alarm_value_more_than)
        if lessThan is not None and value < lessThan:
            return 'value is {}, but should be greater than {}'.format(value, config.alarm_value_less_than)
        return None

    def sendMail(self, s, error):
        '''
        Sends the alarm mail for a failed check.

        s: the settings of the failed check (recipients, sender, smtp data).
        error: the error string returned by check().
        Exceptions raised by smtplib are not caught.
        '''
        if s.smtphost:
            self._log.info("Sending detailed logs to %s via %s", s.mailto, s.smtphost)
        else:
            self._log.info("Sending detailed logs to %s using local smtp", s.mailto)

        # Create main message
        hostname = os.uname()[1]
        m_from = s.mailfrom if s.mailfrom else s.username + "@" + s.hostname
        msg = MIMEMultipart()
        msg['Subject'] = EMAIL_SUBJECT_TPL.format(hostname, s.name)
        msg['From'] = m_from
        msg['To'] = ', '.join(s.mailto)
        msg.preamble = 'This is a multi-part message in MIME format.'

        # Add base text
        body = EMAIL_MESSAGE_TPL.format(
            s.name,
            hostname,
            time.strftime("%a, %d %b %Y %H:%M:%S"),
            error
        )
        msg.attach(MIMEText(body))

        # Send the message. SSL is used only with a remote host; without a
        # configured host the local smtp server is contacted in plain text.
        smtpClass = smtplib.SMTP_SSL if (s.smtpssl and s.smtphost) else smtplib.SMTP
        host = s.smtphost if s.smtphost else 'localhost'
        # The constructor connects when a host is given (and accepts the
        # "host:port" form), so connect() must not be called a second time.
        # The context manager closes the connection even if sending fails.
        with smtpClass(host, timeout=300) as smtp:
            if s.smtpuser or s.smtppass:
                smtp.login(s.smtpuser, s.smtppass)
            smtp.sendmail(m_from, s.mailto, msg.as_string())


class Settings:
    '''
    Represents settings for a check.

    Every value is read from the section of the check; as the config parser
    falls back to the DEFAULT section, the email settings declared there are
    shared by all the checks.
    '''

    EMAIL_LIST_SEP = ','

    def __init__(self, name, config):
        '''
        name: the name of the check (the config section).
        config: the ConfigParser to read the values from.
        Raises configparser.NoOptionError if MAILTO is not configured.
        '''
        self.config = config
        self.hostname = socket.getfqdn()
        self.username = getpass.getuser()

        ## Check name
        self.name = name
        ## Disabled
        self.disabled = self.getBoolean(name, 'DISABLED', False)
        ## Email server connection data
        self.smtphost = self.getStr(name, 'SMTPHOST', None)
        self.smtpuser = self.getStr(name, 'SMTPUSER', None)
        self.smtppass = self.getStr(name, 'SMTPPASS', None)
        self.smtpssl = self.getBoolean(name, 'SMTPSSL', False)
        ## List of email addresses to notify in case of alarm (mandatory)
        mailtoList = config.get(name, 'MAILTO')
        # Empty entries (e.g. caused by a trailing separator) are dropped
        self.mailto = [x.strip() for x in mailtoList.split(self.EMAIL_LIST_SEP) if x.strip()]
        ## Sender address for the notification email
        self.mailfrom = self.getStr(name, 'MAILFROM', self.username + '@' + socket.gethostname())
        ## Values to compare
        self.alarm_string_equal = self.getStr(name, 'ALARM_STRING_EQUAL', None)
        self.alarm_string_not_equal = self.getStr(name, 'ALARM_STRING_NOT_EQUAL', None)
        self.alarm_value_equal = self.getStr(name, 'ALARM_VALUE_EQUAL', None)
        self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None)
        self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None)
        self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None)
        ## Command to obtain the value for comparison
        self.command = self.getStr(name, 'COMMAND', None)
        ## Regexp to extract value from command output (default to match full string)
        self.regexp = self.getStr(name, 'REGEXP', '(.*)')

    def getStr(self, name, key, defaultValue):
        ''' Returns the string value of key in section name, or defaultValue if not configured '''
        return self.config.get(name, key, fallback=defaultValue)

    def getBoolean(self, name, key, defaultValue):
        '''
        Returns the boolean value of key in section name, or defaultValue if not configured.
        Raises ValueError if the configured value is not a valid boolean.
        '''
        return self.config.getboolean(name, key, fallback=defaultValue)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        prog = NAME + '.py',
        description = NAME + ' ' + VERSION + '\n' + DESCRIPTION,
        formatter_class = argparse.RawTextHelpFormatter
    )
    parser.add_argument('configFile', help="configuration file path")
    parser.add_argument('-q', '--quiet', action='store_true', help="suppress non-essential output")
    parser.add_argument('-d', '--dry-run', action='store_true', help="do not send emails")
    args = parser.parse_args()

    if args.quiet:
        level = logging.WARNING
    else:
        level = logging.INFO
    logging.basicConfig(level=level)

    # Top-level boundary: any failure is logged with its traceback and turned
    # into a non-zero exit status, so that cron can notify it
    try:
        main = Main(args.configFile)
        main.run(args.dry_run)
    except Exception as e:
        logging.critical(traceback.format_exc())
        print('ERROR: {}'.format(e))
        sys.exit(1)

    sys.exit(0)