1 Commits
v0.2 ... devel

Author SHA1 Message Date
c729e434ac WIP Dashboard 2022-04-06 19:29:30 +02:00
8 changed files with 249 additions and 196 deletions

1
.gitignore vendored
View File

@ -1,2 +1 @@
*.cfg
healthcheck/healthcheck-virtualenv

View File

@ -4,8 +4,6 @@ Every utility is in a folder with its relevant configuration and is completely s
## HEALTHCHECK
A simple server health check.
Allows keeping the machine vitals (cpu usage, raid status, thermals...) under control and alerting the sysadmin in case of anomalies.
Sends an email and/or executes a command in case of alarm (high temperature, RAID disk failed etc...).
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
Meant to be run with a cron (see healthcheck.cron.example).

3
dashboard/README.md Normal file
View File

@ -0,0 +1,3 @@
# Dashboard
Allows using a tablet, smartphone, ebook reader or any other low-power internet-connected hardware as a system monitor for a host on the same network.

View File

@ -0,0 +1,95 @@
[DEFAULT]
# The webpage will be available at http://this.host.ip.address:PORT
PORT=8080
# The webpage will be updated every REFRESH_SECONDS. Set to 0 to disable autorefresh.
REFRESH_SECONDS=1
#### SENSORS ####
# Every sensor value is obtained by executing a command, parsing its output with a regexp
# to extract (as a single group) the numeric or string value, and using that value to plot the
# graph. These sensor definitions are ready to be used: just enable the ones you need.
# You can add your own by declaring another section like this:
#
# [my_sensor]
# DISABLED=False
# COMMAND=/my/custom/binary --with parameters
# REGEXP=my regex to parse (awesome|disappointing) command output
# TYPE=TIMEGRAPH
# (TYPE may also be GRAPH or ERROR. Keep comments on their own line: an inline comment would become part of the value.)
[system_load_1min]
# The system load average in the last minute
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
TYPE=TIMEGRAPH
[system_load_5min]
# The system load average in the last 5 minutes
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
TYPE=TIMEGRAPH
[system_load_15min]
# The system load average in the last 15 minutes
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
TYPE=TIMEGRAPH
[used_disk_space]
# Used disk space in percent
DISABLED=True
COMMAND=df -h /dev/sda1
REGEXP=(\d{1,3})%
TYPE=GRAPH
[raid_status]
# Raid status
DISABLED=True
COMMAND=cat /proc/mdstat
REGEXP=.*\] \[([U_]+)\]\n
TYPE=ERROR
[laptop_charger_disconnected]
# Laptop charger disconnected
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
DISABLED=True
COMMAND=acpi -a
REGEXP=Adapter \d: (.+)
TYPE=ERROR
[free_ram]
# Free ram in %
# Shows another approach: does all the computation in the command and picks up
# all the output (by not declaring a regexp).
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($4/$2 * 100.0)}'
TYPE=TIMEGRAPH
[available_ram]
# Like Free ram, but shows available instead of free. You may want to use this if you use a memcache.
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
TYPE=TIMEGRAPH
[cpu_temperature]
# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
# The regexp must be adapted to your configuration: run `sensors` in the command line
# to find the name of the temperature sensor in your system. In this case it is `Core 0`,
# but may be called Tdie or a lot of different names, there is no standard.
DISABLED=True
COMMAND=sensors
REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF]
TYPE=TIMEGRAPH
[fan_speed]
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
# The regexp must be adapted to your configuration: run `sensors` in the command line
# to find the name of the fan speed sensor in your system.
DISABLED=True
COMMAND=sensors
# The group must capture all the digits (\d+): a single \d never matches a reading such as "cpu_fan: 1200 RPM"
REGEXP=cpu_fan: +(\d+) RPM
TYPE=TIMEGRAPH

117
dashboard/dashboard.py Executable file
View File

@ -0,0 +1,117 @@
#!/usr/bin/env python3
import os
import sys
import time
from http.server import BaseHTTPRequestHandler, HTTPServer
import logging
import traceback
import re
import locale
import subprocess
import configparser
""" @package docstring
Resources dashboard
Starts a webserver on a specific port of the monitored server and serves a simple webpage containing
the monitored sensors graphs.
Installation:
- Copy dashboard.cfg in /usr/local/etc/dashboard.cfg and customize it
- Copy dashboard.py in /usr/local/bin/dashboard.py
Usage:
Start the server:
/usr/local/bin/dashboard.py /usr/local/etc/dashboard.cfg
@author Daniele Verducci <daniele.verducci@ichibi.eu>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
NAME = 'dashboard'
VERSION = '0.1'
DESCRIPTION = 'A simple system resources dashboard'


class WebServer(BaseHTTPRequestHandler):
    ''' HTTP request handler serving the dashboard page.

    The HTTP server creates one handler instance per request, calling the handler
    class with (request, client_address, server). To keep configPath as the first
    constructor argument, bind it with functools.partial(WebServer, configPath) and
    pass the result to HTTPServer as request handler class.
    '''

    def __init__(self, configPath, *args, **kwargs):
        ''' Sets up the locale, reads the config and, if request arguments are given, handles the request.

        configPath: path of the dashboard configuration file
        args, kwargs: the (request, client_address, server) arguments supplied by the
                      HTTP server. When omitted, the instance is only configured and
                      no request is handled.

        Raises ValueError if $LANG is not set or configPath is not a file.
        '''
        # The locale is needed for parsing numbers: get it from $LANG (i.e. "en_GB.UTF-8")
        systemLocale = os.getenv('LANG')
        if not systemLocale:
            raise ValueError('System environment variable $LANG is not set!')
        locale.setlocale(locale.LC_ALL, systemLocale)

        self._log = logging.getLogger('main')
        if not os.path.isfile(configPath):
            raise ValueError('configPath must be a file')
        # Interpolation is disabled because the config contains regexps
        self.config = configparser.ConfigParser(interpolation=None)
        self.config.read(configPath)
        self.hostname = os.uname()[1]

        # The base initializer processes the whole request (and calls do_GET),
        # so it must run last, once the attributes above are available
        if args or kwargs:
            super().__init__(*args, **kwargs)

    def do_GET(self):
        ''' Replies to a GET request with the dashboard HTML page '''
        import html  # Local import: only needed here, to escape client-provided text

        self.readSensors()

        # The request path comes from the client: escape it before echoing it in the page
        page = (
            '<html><head><title>{title}</title></head>'
            '<body>'
            '<p>Request: {path}</p>'
            '<p>This is an example web server.</p>'
            '</body></html>'
        ).format(
            title=html.escape('{} {}'.format(self.hostname, NAME)),
            path=html.escape(self.path)
        )
        payload = page.encode('utf-8')

        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def readSensors(self):
        ''' Reads the sensors declared in the config (not implemented yet: returns None) '''
        return


if __name__ == '__main__':
    import argparse
    import functools

    parser = argparse.ArgumentParser(
        prog = NAME + '.py',
        description = NAME + ' ' + VERSION + '\n' + DESCRIPTION,
        formatter_class = argparse.RawTextHelpFormatter
    )
    parser.add_argument('configFile', help="configuration file path")
    parser.add_argument('-q', '--quiet', action='store_true', help="suppress non-essential output")
    args = parser.parse_args()

    if args.quiet:
        level = logging.WARNING
    else:
        level = logging.INFO
    logging.basicConfig(level=level)

    # Fail early, before binding the socket, if the configuration cannot be read
    if not os.path.isfile(args.configFile):
        parser.error('configuration file not found: {}'.format(args.configFile))
    config = configparser.ConfigParser(interpolation=None)
    config.read(args.configFile)
    port = config.getint('DEFAULT', 'PORT', fallback=8080)

    # Listen on all interfaces: the page is meant to be opened from other devices on the network.
    # The config path is bound as first argument of the handler created for each request.
    httpd = HTTPServer(('', port), functools.partial(WebServer, args.configFile))
    logging.info('Serving on port {}'.format(port))

    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logging.critical(traceback.format_exc())
        print('ERROR: {}'.format(e))
        sys.exit(1)
    finally:
        httpd.server_close()

    sys.exit(0)

View File

@ -1,7 +1,5 @@
# HEALTHCHECK
A simple server health check.
Allows keeping the machine vitals (cpu usage, raid status, thermals...) under control and alerting the sysadmin in case of anomalies.
Sends an email and/or executes a command in case of alarm.
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
Meant to be run with a cron (see healthcheck.cron.example).
@ -42,10 +40,6 @@ Copy the script and the config file into the system to check:
cp healthcheck.py /usr/local/bin/healthcheck.py
cp healthcheck.cfg.example /usr/local/etc/healthcheck.cfg
```
Make the script executable:
```
chmod +x /usr/local/bin/healthcheck.py
```
Edit `/usr/local/etc/healthcheck.cfg` enabling the checks you need and configuring email settings.
Run `/usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg` to check it is working. If needed, change the config to make a check fail and see if the notification mail is delivered. If you need to do some testing without spamming emails, run with the parameter `--dry-run`.
Now copy the cron file:

View File

@ -1,5 +1,3 @@
# The DEFAULT section contains the global configuration applied to all checks.
# You can re-define these variables in a check to override the global ones.
[DEFAULT]
#### EMAIL NOTIFICATIONS ####
@ -41,27 +39,10 @@ MAILTO=root@localhost, user@localhost
#ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
#### NOTIFICATION POLICY ###
# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding.
# Possible values:
# EVERY_RUN In case of alarm, sends a mail every time the script is run
# START Sends a mail only when an alarm starts
# ONCE_IN_MINUTES In case of alarm, resends a mail only if NOTIFY_MINUTES has passed
NOTIFY=EVERY_RUN
# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES has passed from the previous one
NOTIFY_MINUTES=60
# Sends a mail when the alarm has ended
NOTIFY_ALARM_END=TRUE
#### HEALTH CHECKS ####
# Every health check is based on executing a command, parsing its output with a regexp
# to extract (as a single group) the numeric or string value, and comparing that value with
# a configured one. These checks are ready to be used: just enable the ones you need.
#
# CUSTOM CHECKS:
# You can add your own custom check by declaring another section like this:
#
# [my_custom_check_name]
@ -74,37 +55,28 @@ NOTIFY_ALARM_END=TRUE
# ALARM_VALUE_LESS_THAN=12
# COMMAND=/my/custom/binary --with parameters
# REGEXP=my regex to parse (awesome|disappointing) command output
#
# First test your custom command executing it in the command line
# Take the text output and write a regex to match it. Check every case:
# success result, error result, command failure. Then paste the command
# and regex in this config, enable the check and run it to verify that it is working.
[system_load_1min]
# The system load average in the last minute
DISABLED=False
DISABLED=True
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
[system_load_5min]
# The system load average in the last 5 minutes
DISABLED=False
DISABLED=True
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
[system_load_15min]
# The system load average in the last 15 minutes
DISABLED=False
DISABLED=True
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
[used_disk_space]
# Used disk space (in percent, i.e. ALARM_VALUE_MORE_THAN=75 -> alarm if disk is more than 75% full)
DISABLED=True
@ -112,7 +84,6 @@ ALARM_VALUE_MORE_THAN=75
COMMAND=df -h /dev/sda1
REGEXP=(\d{1,3})%
[raid_status]
# Issues an alarm when the raid is corrupted
# Checks this part of the /proc/mdstat file:
@ -124,7 +95,6 @@ ALARM_STRING_NOT_EQUAL=UU
COMMAND=cat /proc/mdstat
REGEXP=.*\] \[([U_]+)\]\n
[battery_level]
# Issues an alarm when battery is discharging below a certain level (long blackout, pulled power cord...)
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
@ -134,7 +104,6 @@ COMMAND=acpi -b
REGEXP=Battery \d: .*, (\d{1,3})%
ALARM_VALUE_LESS_THAN=90
[laptop_charger_disconnected]
# Issues an alarm when laptop charger is disconnected
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
@ -143,13 +112,19 @@ COMMAND=acpi -a
REGEXP=Adapter \d: (.+)
ALARM_STRING_EQUAL=off-line
[available_ram]
# Shows available ram in %.
DISABLED=False
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
[free_ram]
# Free ram in %
# Shows another approach: does all the computation in the command and picks up
# all the output (by not declaring a regexp).
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($4/$2 * 100.0)}'
ALARM_VALUE_LESS_THAN=20
[available_ram]
# Like Free ram, but shows available instead of free. You may want to use this if you use a memcache.
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
ALARM_VALUE_LESS_THAN=20
[cpu_temperature]
# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
@ -161,7 +136,6 @@ ALARM_VALUE_MORE_THAN=80
COMMAND=sensors
REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF]
[fan_speed]
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
# The regexp must be adapted to your configuration: run `sensors` in the command line
@ -170,38 +144,3 @@ DISABLED=True
ALARM_VALUE_LESS_THAN=300
COMMAND=sensors
REGEXP=cpu_fan: +(\d) RPM
[host_reachability]
# Check if a remote host is alive with Ping. You can replace the ip with a domain name (e.g. COMMAND=ping debian.org -c 1)
#
# Shows another approach: uses the return value to print a string. Leverages ping's ability to return different error codes:
# 0 = success
# 1 = the host is unreachable
# 2 = an error has occurred (and will be logged to stderr)
# We are throwing away stdout and replacing it with a custom text.
# If there is a different text (the stderr), something bad happened, and it will be reported in the mail.
DISABLED=True
ALARM_STRING_NOT_EQUAL=Online
COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline"
[service_webserver]
# Check if a webserver is running on port 80. You can replace the ip with a domain name.
# You can check different services changing the port number. Some examples:
# 80 HTTP Webserver
# 443 HTTPS Webserver
# 21 FTP
# 22 SSH
# 5900 VNC (Linux remote desktop)
# 3389 RDP (Windows remote desktop)
DISABLED=True
ALARM_STRING_NOT_EQUAL=Online
COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline"
[dummy_always_alarm]
# A dummy check that is always in alarm. Useful for testing notifications.
DISABLED=True
ALARM_STRING_EQUAL=Core meltdown!
COMMAND=echo "Core meltdown!"

View File

@ -46,18 +46,13 @@ import socket
import getpass
import re
import locale
import json
NAME = 'healthcheck'
VERSION = '0.1'
DESCRIPTION = 'A simple server monitoring software'
EMAIL_START_SUBJECT_TPL = '\U0001F6A8 {}: {} ALARM!'
EMAIL_START_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'
EMAIL_END_SUBJECT_TPL = '\u2705 {}: {} OK'
EMAIL_END_MESSAGE_TPL = 'Alarm ceased for sensor {} on host {} on {}'
# Healthcheck saves the current status (alarms triggered, last run...) in this file
STATUS_FILE = '/tmp/healthcheck.tmp'
EMAIL_SUBJECT_TPL = 'Host {} failed health check for {}'
EMAIL_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'
class Main:
@ -83,10 +78,6 @@ class Main:
def run(self, dryRun):
''' Runs the health checks '''
# Load status
status = Status()
# Run checks based on the config
for section in self.config:
if section == 'DEFAULT':
continue
@ -94,7 +85,6 @@ class Main:
s = Settings(section, self.config)
if s.disabled:
self._log.info('Ignoring disabled check "{}"'.format(section))
status.unsetAlarm(section)
continue
self._log.info('Checking "{}"'.format(section))
@ -103,37 +93,11 @@ class Main:
if error:
# Alarm!
logging.warning('Alarm for {}: {}!'.format(section, error))
if self.shouldNotify(section, s, status):
status.setAlarm(section)
if not dryRun:
if s.mailto:
self.sendAlmStartMail(s, error)
self.sendMail(s, error)
if s.alarmCommand:
self.executeAlarmCommand(s, error)
elif status.getAlarmTriggeredTimestamp(section) is not None:
logging.info('Alarm ceased for {}: OK!'.format(section))
if s.notify_alarm_end:
self.sendAlmEndMail(s)
status.unsetAlarm(section)
# Save updated status
status.save()
def shouldNotify(self, section, settings, status):
    ''' Tells whether a notification must be sent for the alarm of the given section.

    section: name of the check in alarm
    settings: settings of the check (reads notify and, for ONCE_IN_MINUTES, notify_minutes)
    status: object returning the alarm timestamp via getAlarmTriggeredTimestamp(section)

    Returns True when the alarm is not yet recorded in status, when NOTIFY=EVERY_RUN,
    or when NOTIFY=ONCE_IN_MINUTES and more than notify_minutes have passed since
    the recorded timestamp. Returns False otherwise.
    '''
    triggeredAt = status.getAlarmTriggeredTimestamp(section)

    # No timestamp recorded: the alarm just started
    alarmJustStarted = triggeredAt is None
    if alarmJustStarted or settings.notify == 'EVERY_RUN':
        return True

    # From here on only the time-based policy can still ask for a notification
    if settings.notify != 'ONCE_IN_MINUTES':
        return False

    elapsedSeconds = time.time() - triggeredAt
    return elapsedSeconds > (settings.notify_minutes * 60)
# Calls the provided command, checks the value parsing it with the provided regexp
# and returns an error string, or null if the value is within its limits
@ -148,15 +112,12 @@ class Main:
stdout = ""
ret = subprocess.run(config.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
if ret.stderr:
self._log.info('{} subprocess stderr:\n{}'.format(config.command, ret.stderr.decode()))
self._log.info('{} subprocess stderr:\n{}', config.command, ret.stderr.decode())
if ret.stdout:
stdout = ret.stdout.decode()
self._log.debug('{} subprocess stdout:\n{}'.format(config.command, stdout))
self._log.debug('{} subprocess stdout:\n{}', config.command, stdout)
if ret.returncode != 0:
return 'the command exited with error code {} {}'.format(
ret.returncode,
'and error message "{}"'.format(ret.stderr.decode().strip()) if ret.stderr else ''
)
return 'subprocess {} exited with error code {}'.format(config.command, ret.returncode)
# Parse result with regex
match = re.search(config.regexp, stdout, re.MULTILINE)
@ -182,34 +143,15 @@ class Main:
if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than):
return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than)
def sendAlmStartMail(self, s, error):
    ''' Composes the "alarm started" mail for the check s and passes it to sendMail.

    s: settings of the check in alarm (its name is used in subject and body)
    error: error string describing the alarm, appended to the body
    '''
    mailSubject = EMAIL_START_SUBJECT_TPL.format(self.hostname, s.name)
    timestamp = time.strftime("%a, %d %b %Y %H:%M:%S")
    mailBody = EMAIL_START_MESSAGE_TPL.format(s.name, self.hostname, timestamp, error)
    self.sendMail(s, mailSubject, mailBody)
def sendAlmEndMail(self, s):
    ''' Composes the "alarm ceased" mail for the check s and passes it to sendMail.

    s: settings of the check whose alarm has ended (its name is used in subject and body)
    '''
    mailSubject = EMAIL_END_SUBJECT_TPL.format(self.hostname, s.name)
    timestamp = time.strftime("%a, %d %b %Y %H:%M:%S")
    mailBody = EMAIL_END_MESSAGE_TPL.format(s.name, self.hostname, timestamp)
    self.sendMail(s, mailSubject, mailBody)
def sendMail(self, s, subject, body):
def sendMail(self, s, error):
if s.smtphost:
logging.info("Sending email to %s via %s", s.mailto, s.smtphost)
logging.info("Sending alarm email to %s via %s", s.mailto, s.smtphost)
else:
logging.info("Sending email to %s using local smtp", s.mailto)
logging.info("Sending alarm email to %s using local smtp", s.mailto)
# Create main message
msg = MIMEMultipart()
msg['Subject'] = subject
msg['Subject'] = EMAIL_SUBJECT_TPL.format(self.hostname, s.name)
if s.mailfrom:
m_from = s.mailfrom
else:
@ -219,6 +161,12 @@ class Main:
msg.preamble = 'This is a multi-part message in MIME format.'
# Add base text
body = EMAIL_MESSAGE_TPL.format(
s.name,
self.hostname,
time.strftime("%a, %d %b %Y %H:%M:%S"),
error
)
txt = MIMEText(body)
msg.attach(txt)
@ -256,34 +204,6 @@ class Main:
self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode))
class Status:
    ''' Represents the current status (alarms triggered, last run...)

    The status is a dict persisted as JSON in STATUS_FILE:
        lastRun: unix time in seconds of the last save
        alarms:  key-value, alarmName : alarmTriggeredTimestamp
    '''

    def __init__(self):
        ''' Loads the status from STATUS_FILE, starting from an empty one if the file is missing or unusable '''
        try:
            with open(STATUS_FILE, 'r') as openfile:
                loaded = json.load(openfile)
        except (FileNotFoundError, ValueError):
            # ValueError covers json.JSONDecodeError: a truncated or corrupted file
            # must not prevent all the subsequent runs
            loaded = None

        # Valid JSON with an unexpected shape is as unusable as a corrupted file
        if not isinstance(loaded, dict) or not isinstance(loaded.get('alarms'), dict):
            loaded = {
                'lastRun': 0, # unix time in seconds
                'alarms': {}, # key-value, alarmName : alarmTriggeredTimestamp
            }
        self.status = loaded

    def save(self):
        ''' Updates the last run time and writes the status to STATUS_FILE '''
        self.status['lastRun'] = time.time()
        jo = json.dumps(self.status)
        with open(STATUS_FILE, "w") as outfile:
            outfile.write(jo)

    def setAlarm(self, almName):
        ''' Records the current time as trigger time of the alarm almName '''
        self.status['alarms'][almName] = time.time()

    def unsetAlarm(self, almName):
        ''' Forgets the alarm almName (does nothing if it is not set) '''
        self.status['alarms'].pop(almName, None)

    def getAlarmTriggeredTimestamp(self, almName):
        ''' Returns the recorded trigger time of the alarm almName, or None if it is not set '''
        return self.status['alarms'].get(almName, None)
class Settings:
''' Represents settings for a check '''
@ -321,10 +241,6 @@ class Settings:
self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None)
self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None)
self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None)
## Notification policy
self.notify = self.getEnum(name, 'NOTIFY', 'EVERY_RUN', ['EVERY_RUN', 'START', 'ONCE_IN_MINUTES'])
self.notify_minutes = self.getInt(name, 'NOTIFY_MINUTES', 0)
self.notify_alarm_end = self.getBoolean(name, 'NOTIFY_ALARM_END', True)
## Command to obtain the value for comparation
self.command = self.getStr(name, 'COMMAND', None)
## Regexp to extract value from command output (default to match full string)
@ -336,20 +252,12 @@ class Settings:
except configparser.NoOptionError:
return defaultValue
def getInt(self, name, key, defaultValue):
    ''' Returns the value obtained from getStr(name, key, defaultValue) converted to int '''
    rawValue = self.getStr(name, key, defaultValue)
    return int(rawValue)
def getBoolean(self, name, key, defaultValue):
    ''' Returns the option key of section name parsed as boolean, or defaultValue if the option is not declared '''
    try:
        value = self.config.getboolean(name, key)
    except configparser.NoOptionError:
        value = defaultValue
    return value
def getEnum(self, name, key, defaultValue, values):
    ''' Returns the value obtained from getStr(name, key, defaultValue) if it is one of values.

    Raises ValueError, listing the accepted values, otherwise.
    '''
    val = self.getStr(name, key, defaultValue)
    if val in values:
        return val
    raise ValueError("Invalid value {} for configuration {}: expected one of {}".format(val, key, ', '.join(values)))
if __name__ == '__main__':