Compare commits
7 Commits
Author | SHA1 | Date | |
---|---|---|---|
b93b5eb958 | |||
780b2ac5b3 | |||
56c1e01856 | |||
cd146018de | |||
e24383728a | |||
7f6fa1d0fa | |||
af9cbbf393 |
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
*.cfg
|
*.cfg
|
||||||
|
healthcheck/healthcheck-virtualenv
|
||||||
|
@ -4,6 +4,8 @@ Every utility is in a folder with its relevant configuration and is completely s
|
|||||||
|
|
||||||
## HEALTHCHECK
|
## HEALTHCHECK
|
||||||
A simple server health check.
|
A simple server health check.
|
||||||
|
Keeps the machine vitals (CPU usage, RAID status, thermals...) under control and alerts the sysadmin in case of anomalies.
|
||||||
|
|
||||||
Sends an email and/or executes a command in case of alarm (high temperature, RAID disk failed etc...).
|
Sends an email and/or executes a command in case of alarm (high temperature, RAID disk failed etc...).
|
||||||
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
|
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
|
||||||
Meant to be run with a cron (see healthcheck.cron.example).
|
Meant to be run with a cron (see healthcheck.cron.example).
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
# HEALTHCHECK
|
# HEALTHCHECK
|
||||||
A simple server health check.
|
A simple server health check.
|
||||||
|
Keeps the machine vitals (CPU usage, RAID status, thermals...) under control and alerts the sysadmin in case of anomalies.
|
||||||
|
|
||||||
Sends an email and/or executes a command in case of alarm.
|
Sends an email and/or executes a command in case of alarm.
|
||||||
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
|
As an example, the command may be a ntfy call to obtain a notification on a mobile phone or desktop computer.
|
||||||
Meant to be run with a cron (see healthcheck.cron.example).
|
Meant to be run with a cron (see healthcheck.cron.example).
|
||||||
@ -40,6 +42,10 @@ Copy the script and the config file into the system to check:
|
|||||||
cp healthcheck.py /usr/local/bin/healthcheck.py
|
cp healthcheck.py /usr/local/bin/healthcheck.py
|
||||||
cp healthcheck.cfg.example /usr/local/etc/healthcheck.cfg
|
cp healthcheck.cfg.example /usr/local/etc/healthcheck.cfg
|
||||||
```
|
```
|
||||||
|
Make the script executable:
|
||||||
|
```
|
||||||
|
chmod +x /usr/local/bin/healthcheck.py
|
||||||
|
```
|
||||||
Edit `/usr/local/etc/healthcheck.cfg` enabling the checks you need and configuring email settings.
|
Edit `/usr/local/etc/healthcheck.cfg` enabling the checks you need and configuring email settings.
|
||||||
Run `/usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg` to check it is working. If needed, change the config to make a check fail and see if the notification mail is delivered. If you need to do some testing without spamming emails, run with the parameter `--dry-run`.
|
Run `/usr/local/bin/healthcheck.py /usr/local/etc/healthcheck.cfg` to check it is working. If needed, change the config to make a check fail and see if the notification mail is delivered. If you need to do some testing without spamming emails, run with the parameter `--dry-run`.
|
||||||
Now copy the cron file:
|
Now copy the cron file:
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
# The DEFAULT section contains the global configuration applied to all checks.
|
||||||
|
# You can re-define these variables in a check to override the global ones.
|
||||||
[DEFAULT]
|
[DEFAULT]
|
||||||
|
|
||||||
#### EMAIL NOTIFICATIONS ####
|
#### EMAIL NOTIFICATIONS ####
|
||||||
@ -39,10 +41,27 @@ MAILTO=root@localhost, user@localhost
|
|||||||
#ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
|
#ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
|
||||||
|
|
||||||
|
|
||||||
|
#### NOTIFICATION POLICY ###
|
||||||
|
# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding.
|
||||||
|
# Possible values:
|
||||||
|
# EVERY_RUN In case of alarm, sends a mail every time the script is run
|
||||||
|
# START Sends a mail only when an alarm starts
|
||||||
|
# ONCE_IN_MINUTES In case of alarm, resends a mail only if NOTIFY_MINUTES has passed
|
||||||
|
NOTIFY=EVERY_RUN
|
||||||
|
|
||||||
|
# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES has passed from the previous one
|
||||||
|
NOTIFY_MINUTES=60
|
||||||
|
|
||||||
|
# Sends a mail when the alarm has ended
|
||||||
|
NOTIFY_ALARM_END=TRUE
|
||||||
|
|
||||||
|
|
||||||
#### HEALTH CHECKS ####
|
#### HEALTH CHECKS ####
|
||||||
# Every health check is based on a command being executed, its result being parsed with a regexp
|
# Every health check is based on a command being executed, its result being parsed with a regexp
|
||||||
# to extract (as a single group) the numeric or string value, and the value being compared with
|
# to extract (as a single group) the numeric or string value, and the value being compared with
|
||||||
# a configured value. These checks are ready to be used, just enable the ones you need.
|
# a configured value. These checks are ready to be used, just enable the ones you need.
|
||||||
|
#
|
||||||
|
# CUSTOM CHECKS:
|
||||||
# You can add your own custom check declaring another section like this:
|
# You can add your own custom check declaring another section like this:
|
||||||
#
|
#
|
||||||
# [my_custom_check_name]
|
# [my_custom_check_name]
|
||||||
@ -55,28 +74,37 @@ MAILTO=root@localhost, user@localhost
|
|||||||
# ALARM_VALUE_LESS_THAN=12
|
# ALARM_VALUE_LESS_THAN=12
|
||||||
# COMMAND=/my/custom/binary --with parameters
|
# COMMAND=/my/custom/binary --with parameters
|
||||||
# REGEXP=my regex to parse (awesome|disappointing) command output
|
# REGEXP=my regex to parse (awesome|disappointing) command output
|
||||||
|
#
|
||||||
|
# First test your custom command executing it in the command line
|
||||||
|
# Take the text output and write a regex to match it. Check every case:
|
||||||
|
# success result, error result, command failure. Then paste the command
|
||||||
|
# and regex in this config, enable the check and run it to verify it is working.
|
||||||
|
|
||||||
|
|
||||||
[system_load_1min]
|
[system_load_1min]
|
||||||
# The system load average in the last minute
|
# The system load average in the last minute
|
||||||
DISABLED=True
|
DISABLED=False
|
||||||
ALARM_VALUE_MORE_THAN=1.0
|
ALARM_VALUE_MORE_THAN=1.0
|
||||||
COMMAND=uptime
|
COMMAND=uptime
|
||||||
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
|
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
|
||||||
|
|
||||||
|
|
||||||
[system_load_5min]
|
[system_load_5min]
|
||||||
# The system load average in the last 5 minutes
|
# The system load average in the last 5 minutes
|
||||||
DISABLED=True
|
DISABLED=False
|
||||||
ALARM_VALUE_MORE_THAN=1.0
|
ALARM_VALUE_MORE_THAN=1.0
|
||||||
COMMAND=uptime
|
COMMAND=uptime
|
||||||
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
|
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
|
||||||
|
|
||||||
|
|
||||||
[system_load_15min]
|
[system_load_15min]
|
||||||
# The system load average in the last 15 minutes
|
# The system load average in the last 15 minutes
|
||||||
DISABLED=True
|
DISABLED=False
|
||||||
ALARM_VALUE_MORE_THAN=1.0
|
ALARM_VALUE_MORE_THAN=1.0
|
||||||
COMMAND=uptime
|
COMMAND=uptime
|
||||||
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
|
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
|
||||||
|
|
||||||
|
|
||||||
[used_disk_space]
|
[used_disk_space]
|
||||||
# Used disk space (in percent, i.e. ALARM_VALUE_MORE_THAN=75 -> alarm if disk is more than 75% full)
|
# Used disk space (in percent, i.e. ALARM_VALUE_MORE_THAN=75 -> alarm if disk is more than 75% full)
|
||||||
DISABLED=True
|
DISABLED=True
|
||||||
@ -84,6 +112,7 @@ ALARM_VALUE_MORE_THAN=75
|
|||||||
COMMAND=df -h /dev/sda1
|
COMMAND=df -h /dev/sda1
|
||||||
REGEXP=(\d{1,3})%
|
REGEXP=(\d{1,3})%
|
||||||
|
|
||||||
|
|
||||||
[raid_status]
|
[raid_status]
|
||||||
# Issues an alarm when the raid is corrupted
|
# Issues an alarm when the raid is corrupted
|
||||||
# Checks this part of the /proc/mdstat file:
|
# Checks this part of the /proc/mdstat file:
|
||||||
@ -95,6 +124,7 @@ ALARM_STRING_NOT_EQUAL=UU
|
|||||||
COMMAND=cat /proc/mdstat
|
COMMAND=cat /proc/mdstat
|
||||||
REGEXP=.*\] \[([U_]+)\]\n
|
REGEXP=.*\] \[([U_]+)\]\n
|
||||||
|
|
||||||
|
|
||||||
[battery_level]
|
[battery_level]
|
||||||
# Issues an alarm when battery is discharging below a certain level (long blackout, pulled power cord...)
|
# Issues an alarm when battery is discharging below a certain level (long blackout, pulled power cord...)
|
||||||
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
||||||
@ -104,6 +134,7 @@ COMMAND=acpi -b
|
|||||||
REGEXP=Battery \d: .*, (\d{1,3})%
|
REGEXP=Battery \d: .*, (\d{1,3})%
|
||||||
ALARM_VALUE_LESS_THAN=90
|
ALARM_VALUE_LESS_THAN=90
|
||||||
|
|
||||||
|
|
||||||
[laptop_charger_disconnected]
|
[laptop_charger_disconnected]
|
||||||
# Issues an alarm when laptop charger is disconnected
|
# Issues an alarm when laptop charger is disconnected
|
||||||
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
||||||
@ -112,20 +143,14 @@ COMMAND=acpi -a
|
|||||||
REGEXP=Adapter \d: (.+)
|
REGEXP=Adapter \d: (.+)
|
||||||
ALARM_STRING_EQUAL=off-line
|
ALARM_STRING_EQUAL=off-line
|
||||||
|
|
||||||
[free_ram]
|
|
||||||
# Free ram in %
|
|
||||||
# Shows another approach: does all the computation in the command and picks up
|
|
||||||
# all the output (by not declaring a regexp).
|
|
||||||
DISABLED=True
|
|
||||||
COMMAND=free | grep Mem | awk '{print int($4/$2 * 100.0)}'
|
|
||||||
ALARM_VALUE_LESS_THAN=20
|
|
||||||
|
|
||||||
[available_ram]
|
[available_ram]
|
||||||
# Like Free ram, but shows available instead of free. You may want to use this if you use a memcache.
|
# Shows available ram in %.
|
||||||
DISABLED=True
|
DISABLED=False
|
||||||
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
|
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
|
||||||
ALARM_VALUE_LESS_THAN=20
|
ALARM_VALUE_LESS_THAN=20
|
||||||
|
|
||||||
|
|
||||||
[cpu_temperature]
|
[cpu_temperature]
|
||||||
# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
||||||
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
||||||
@ -136,6 +161,7 @@ ALARM_VALUE_MORE_THAN=80
|
|||||||
COMMAND=sensors
|
COMMAND=sensors
|
||||||
REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF]
|
REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF]
|
||||||
|
|
||||||
|
|
||||||
[fan_speed]
|
[fan_speed]
|
||||||
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
||||||
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
||||||
@ -144,3 +170,38 @@ DISABLED=True
|
|||||||
ALARM_VALUE_LESS_THAN=300
|
ALARM_VALUE_LESS_THAN=300
|
||||||
COMMAND=sensors
|
COMMAND=sensors
|
||||||
REGEXP=cpu_fan: +(\d) RPM
|
REGEXP=cpu_fan: +(\d) RPM
|
||||||
|
|
||||||
|
|
||||||
|
[host_reachability]
|
||||||
|
# Check if a remote host is alive with Ping. You can replace the ip with a domain name (e.g. COMMAND=ping debian.org -c 1)
|
||||||
|
#
|
||||||
|
# Shows another approach: uses the return value to print a string. Leverages ping's ability to return different error codes:
|
||||||
|
# 0 = success
|
||||||
|
# 1 = the host is unreachable
|
||||||
|
# 2 = an error has occurred (and will be logged to stderr)
|
||||||
|
# We are throwing away stdout and replacing it with a custom text.
|
||||||
|
# If there is a different text (the stderr), something bad happened, and it will be reported in the mail.
|
||||||
|
DISABLED=True
|
||||||
|
ALARM_STRING_NOT_EQUAL=Online
|
||||||
|
COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline"
|
||||||
|
|
||||||
|
|
||||||
|
[service_webserver]
|
||||||
|
# Check if a webserver is running on port 80. You can replace the ip with a domain name.
|
||||||
|
# You can check different services changing the port number. Some examples:
|
||||||
|
# 80 HTTP Webserver
|
||||||
|
# 443 HTTPS Webserver
|
||||||
|
# 21 FTP
|
||||||
|
# 22 SSH
|
||||||
|
# 5900 VNC (Linux remote desktop)
|
||||||
|
# 3389 RDP (Windows remote desktop)
|
||||||
|
DISABLED=True
|
||||||
|
ALARM_STRING_NOT_EQUAL=Online
|
||||||
|
COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline"
|
||||||
|
|
||||||
|
|
||||||
|
[dummy_always_alarm]
|
||||||
|
# A dummy check that is always in alarm. Useful for testing notifications.
|
||||||
|
DISABLED=True
|
||||||
|
ALARM_STRING_EQUAL=Core meltdown!
|
||||||
|
COMMAND=echo "Core meltdown!"
|
||||||
|
@ -46,13 +46,18 @@ import socket
|
|||||||
import getpass
|
import getpass
|
||||||
import re
|
import re
|
||||||
import locale
|
import locale
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
NAME = 'healthcheck'
|
NAME = 'healthcheck'
|
||||||
VERSION = '0.1'
|
VERSION = '0.1'
|
||||||
DESCRIPTION = 'A simple server monitoring software'
|
DESCRIPTION = 'A simple server monitoring software'
|
||||||
EMAIL_SUBJECT_TPL = 'Host {} failed health check for {}'
|
EMAIL_START_SUBJECT_TPL = '\U0001F6A8 {}: {} ALARM!'
|
||||||
EMAIL_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'
|
EMAIL_START_MESSAGE_TPL = 'Alarm for sensor {} on host {} on {}: {}'
|
||||||
|
EMAIL_END_SUBJECT_TPL = '\u2705 {}: {} OK'
|
||||||
|
EMAIL_END_MESSAGE_TPL = 'Alarm ceased for sensor {} on host {} on {}'
|
||||||
|
# Healthcheck saves the current status (alarms triggered, last run...) in this file
|
||||||
|
STATUS_FILE = '/tmp/healthcheck.tmp'
|
||||||
|
|
||||||
class Main:
|
class Main:
|
||||||
|
|
||||||
@ -78,6 +83,10 @@ class Main:
|
|||||||
def run(self, dryRun):
|
def run(self, dryRun):
|
||||||
''' Runs the health checks '''
|
''' Runs the health checks '''
|
||||||
|
|
||||||
|
# Load status
|
||||||
|
status = Status()
|
||||||
|
|
||||||
|
# Run checks based on the config
|
||||||
for section in self.config:
|
for section in self.config:
|
||||||
if section == 'DEFAULT':
|
if section == 'DEFAULT':
|
||||||
continue
|
continue
|
||||||
@ -85,6 +94,7 @@ class Main:
|
|||||||
s = Settings(section, self.config)
|
s = Settings(section, self.config)
|
||||||
if s.disabled:
|
if s.disabled:
|
||||||
self._log.info('Ignoring disabled check "{}"'.format(section))
|
self._log.info('Ignoring disabled check "{}"'.format(section))
|
||||||
|
status.unsetAlarm(section)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self._log.info('Checking "{}"'.format(section))
|
self._log.info('Checking "{}"'.format(section))
|
||||||
@ -93,11 +103,37 @@ class Main:
|
|||||||
if error:
|
if error:
|
||||||
# Alarm!
|
# Alarm!
|
||||||
logging.warning('Alarm for {}: {}!'.format(section, error))
|
logging.warning('Alarm for {}: {}!'.format(section, error))
|
||||||
if not dryRun:
|
if self.shouldNotify(section, s, status):
|
||||||
if s.mailto:
|
status.setAlarm(section)
|
||||||
self.sendMail(s, error)
|
if not dryRun:
|
||||||
if s.alarmCommand:
|
if s.mailto:
|
||||||
self.executeAlarmCommand(s, error)
|
self.sendAlmStartMail(s, error)
|
||||||
|
if s.alarmCommand:
|
||||||
|
self.executeAlarmCommand(s, error)
|
||||||
|
elif status.getAlarmTriggeredTimestamp(section) is not None:
|
||||||
|
logging.info('Alarm ceased for {}: OK!'.format(section))
|
||||||
|
if s.notify_alarm_end:
|
||||||
|
self.sendAlmEndMail(s)
|
||||||
|
status.unsetAlarm(section)
|
||||||
|
|
||||||
|
# Save updated status
|
||||||
|
status.save()
|
||||||
|
|
||||||
|
def shouldNotify(self, section, settings, status):
    ''' Tells whether a notification has to be sent for an alarm on a check.

    section  -- name of the check in alarm
    settings -- settings of the check; `notify` and `notify_minutes` are read
    status   -- object queried with getAlarmTriggeredTimestamp(section)

    Returns True when the notification must be sent, False otherwise.
    '''
    triggeredAt = status.getAlarmTriggeredTimestamp(section)

    # No timestamp recorded for this check: the alarm has just started
    if triggeredAt is None:
        return True

    policy = settings.notify

    # NOTIFY=EVERY_RUN: no throttling at all
    if policy == 'EVERY_RUN':
        return True

    # Any other policy except ONCE_IN_MINUTES (i.e. START) stays silent
    # once a timestamp has been recorded
    if policy != 'ONCE_IN_MINUTES':
        return False

    # NOTIFY=ONCE_IN_MINUTES: notify again only after NOTIFY_MINUTES
    secondsElapsed = time.time() - triggeredAt
    return secondsElapsed > (settings.notify_minutes * 60)
|
||||||
|
|
||||||
# Calls the provided command, checks the value parsing it with the provided regexp
|
# Calls the provided command, checks the value parsing it with the provided regexp
|
||||||
# and returns an error string, or null if the value is within its limits
|
# and returns an error string, or null if the value is within its limits
|
||||||
@ -112,12 +148,15 @@ class Main:
|
|||||||
stdout = ""
|
stdout = ""
|
||||||
ret = subprocess.run(config.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
ret = subprocess.run(config.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||||
if ret.stderr:
|
if ret.stderr:
|
||||||
self._log.info('{} subprocess stderr:\n{}', config.command, ret.stderr.decode())
|
self._log.info('{} subprocess stderr:\n{}'.format(config.command, ret.stderr.decode()))
|
||||||
if ret.stdout:
|
if ret.stdout:
|
||||||
stdout = ret.stdout.decode()
|
stdout = ret.stdout.decode()
|
||||||
self._log.debug('{} subprocess stdout:\n{}', config.command, stdout)
|
self._log.debug('{} subprocess stdout:\n{}'.format(config.command, stdout))
|
||||||
if ret.returncode != 0:
|
if ret.returncode != 0:
|
||||||
return 'subprocess {} exited with error code {}'.format(config.command, ret.returncode)
|
return 'the command exited with error code {} {}'.format(
|
||||||
|
ret.returncode,
|
||||||
|
'and error message "{}"'.format(ret.stderr.decode().strip()) if ret.stderr else ''
|
||||||
|
)
|
||||||
|
|
||||||
# Parse result with regex
|
# Parse result with regex
|
||||||
match = re.search(config.regexp, stdout, re.MULTILINE)
|
match = re.search(config.regexp, stdout, re.MULTILINE)
|
||||||
@ -143,15 +182,34 @@ class Main:
|
|||||||
if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than):
|
if config.alarm_value_less_than and locale.atof(detectedValue) < float(config.alarm_value_less_than):
|
||||||
return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than)
|
return 'value is {}, but should be greater than {}'.format(locale.atof(detectedValue), config.alarm_value_less_than)
|
||||||
|
|
||||||
def sendMail(self, s, error):
|
def sendAlmStartMail(self, s, error):
    ''' Composes the "alarm started" email for a check and passes it to sendMail.

    s     -- settings of the check in alarm (its `name` is used in the text)
    error -- error string describing the alarm, placed in the message body
    '''
    now = time.strftime("%a, %d %b %Y %H:%M:%S")
    self.sendMail(
        s,
        EMAIL_START_SUBJECT_TPL.format(self.hostname, s.name),
        EMAIL_START_MESSAGE_TPL.format(s.name, self.hostname, now, error),
    )
|
||||||
|
|
||||||
|
def sendAlmEndMail(self, s):
    ''' Composes the "alarm ceased" email for a check and passes it to sendMail.

    s -- settings of the check whose alarm ended (its `name` is used in the text)
    '''
    now = time.strftime("%a, %d %b %Y %H:%M:%S")
    self.sendMail(
        s,
        EMAIL_END_SUBJECT_TPL.format(self.hostname, s.name),
        EMAIL_END_MESSAGE_TPL.format(s.name, self.hostname, now),
    )
|
||||||
|
|
||||||
|
def sendMail(self, s, subject, body):
|
||||||
if s.smtphost:
|
if s.smtphost:
|
||||||
logging.info("Sending alarm email to %s via %s", s.mailto, s.smtphost)
|
logging.info("Sending email to %s via %s", s.mailto, s.smtphost)
|
||||||
else:
|
else:
|
||||||
logging.info("Sending alarm email to %s using local smtp", s.mailto)
|
logging.info("Sending email to %s using local smtp", s.mailto)
|
||||||
|
|
||||||
# Create main message
|
# Create main message
|
||||||
msg = MIMEMultipart()
|
msg = MIMEMultipart()
|
||||||
msg['Subject'] = EMAIL_SUBJECT_TPL.format(self.hostname, s.name)
|
msg['Subject'] = subject
|
||||||
if s.mailfrom:
|
if s.mailfrom:
|
||||||
m_from = s.mailfrom
|
m_from = s.mailfrom
|
||||||
else:
|
else:
|
||||||
@ -161,12 +219,6 @@ class Main:
|
|||||||
msg.preamble = 'This is a multi-part message in MIME format.'
|
msg.preamble = 'This is a multi-part message in MIME format.'
|
||||||
|
|
||||||
# Add base text
|
# Add base text
|
||||||
body = EMAIL_MESSAGE_TPL.format(
|
|
||||||
s.name,
|
|
||||||
self.hostname,
|
|
||||||
time.strftime("%a, %d %b %Y %H:%M:%S"),
|
|
||||||
error
|
|
||||||
)
|
|
||||||
txt = MIMEText(body)
|
txt = MIMEText(body)
|
||||||
msg.attach(txt)
|
msg.attach(txt)
|
||||||
|
|
||||||
@ -204,6 +256,34 @@ class Main:
|
|||||||
self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode))
|
self._log.error('subprocess {} exited with error code {}'.format(cmdToRun, ret.returncode))
|
||||||
|
|
||||||
|
|
||||||
|
class Status:
    ''' Represents the current status (alarms triggered, last run...)

    The status is a dictionary with two entries:
      'lastRun' -- unix time in seconds, updated by save()
      'alarms'  -- key-value, alarmName : alarmTriggeredTimestamp
    It is read from STATUS_FILE on creation and written back by save().
    '''

    def __init__(self):
        ''' Loads the saved status, or starts from an empty one '''
        self.status = self._load(STATUS_FILE)

    @staticmethod
    def _load(path):
        ''' Reads the status dictionary from the JSON file at path.

        Returns an empty status when the file does not exist, cannot be
        parsed, or does not contain a JSON object, so that a damaged status
        file never prevents the checks from running. Missing or malformed
        'lastRun' / 'alarms' entries are replaced with their empty values.
        '''
        empty = {
            'lastRun': 0,   # unix time in seconds
            'alarms': {},   # key-value, alarmName : alarmTriggeredTimestamp
        }
        try:
            with open(path, 'r') as openfile:
                loaded = json.load(openfile)
        except FileNotFoundError:
            # First run: nothing saved yet
            return empty
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses: the file is truncated or otherwise corrupted
            logging.warning('Ignoring unreadable status file {}: {}'.format(path, e))
            return empty

        if not isinstance(loaded, dict):
            logging.warning('Ignoring status file {}: unexpected content'.format(path))
            return empty

        # Repair partial content instead of failing later with a KeyError
        if not isinstance(loaded.get('alarms'), dict):
            loaded['alarms'] = {}
        loaded.setdefault('lastRun', 0)
        return loaded

    def save(self):
        ''' Updates the last run time and writes the status to STATUS_FILE '''
        self.status['lastRun'] = time.time()
        # Serialize before opening, so a serialization error cannot truncate the file
        jo = json.dumps(self.status)
        with open(STATUS_FILE, "w") as outfile:
            outfile.write(jo)

    def setAlarm(self, almName):
        ''' Records the current time as the triggered timestamp of almName '''
        self.status['alarms'][almName] = time.time()

    def unsetAlarm(self, almName):
        ''' Forgets the alarm almName; does nothing if it was not set '''
        self.status['alarms'].pop(almName, None)

    def getAlarmTriggeredTimestamp(self, almName):
        ''' Returns the timestamp recorded for almName, or None if not set '''
        return self.status['alarms'].get(almName, None)
|
||||||
|
|
||||||
|
|
||||||
class Settings:
|
class Settings:
|
||||||
''' Represents settings for a check '''
|
''' Represents settings for a check '''
|
||||||
@ -241,6 +321,10 @@ class Settings:
|
|||||||
self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None)
|
self.alarm_value_not_equal = self.getStr(name, 'ALARM_VALUE_NOT_EQUAL', None)
|
||||||
self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None)
|
self.alarm_value_more_than = self.getStr(name, 'ALARM_VALUE_MORE_THAN', None)
|
||||||
self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None)
|
self.alarm_value_less_than = self.getStr(name, 'ALARM_VALUE_LESS_THAN', None)
|
||||||
|
## Notification policy
|
||||||
|
self.notify = self.getEnum(name, 'NOTIFY', 'EVERY_RUN', ['EVERY_RUN', 'START', 'ONCE_IN_MINUTES'])
|
||||||
|
self.notify_minutes = self.getInt(name, 'NOTIFY_MINUTES', 0)
|
||||||
|
self.notify_alarm_end = self.getBoolean(name, 'NOTIFY_ALARM_END', True)
|
||||||
## Command to obtain the value for comparation
|
## Command to obtain the value for comparation
|
||||||
self.command = self.getStr(name, 'COMMAND', None)
|
self.command = self.getStr(name, 'COMMAND', None)
|
||||||
## Regexp to extract value from command output (default to match full string)
|
## Regexp to extract value from command output (default to match full string)
|
||||||
@ -252,12 +336,20 @@ class Settings:
|
|||||||
except configparser.NoOptionError:
|
except configparser.NoOptionError:
|
||||||
return defaultValue
|
return defaultValue
|
||||||
|
|
||||||
|
def getInt(self, name, key, defaultValue):
    ''' Returns the option `key` of the check `name` converted to an integer.

    The raw value is obtained through getStr(name, key, defaultValue).
    When that value is None (e.g. a None default was passed through),
    None is returned instead of failing on int(None).

    Raises ValueError, naming the offending option, when the value is not
    a valid integer.
    '''
    raw = self.getStr(name, key, defaultValue)
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError) as e:
        raise ValueError("Invalid value {} for configuration {}: expected an integer".format(raw, key)) from e
|
||||||
|
|
||||||
def getBoolean(self, name, key, defaultValue):
|
def getBoolean(self, name, key, defaultValue):
|
||||||
try:
|
try:
|
||||||
return self.config.getboolean(name, key)
|
return self.config.getboolean(name, key)
|
||||||
except configparser.NoOptionError:
|
except configparser.NoOptionError:
|
||||||
return defaultValue
|
return defaultValue
|
||||||
|
|
||||||
|
def getEnum(self, name, key, defaultValue, values):
    ''' Returns the option `key` of the check `name`, restricted to `values`.

    The raw value is obtained through getStr(name, key, defaultValue).
    An exact match is returned as is. Otherwise the value is compared
    ignoring case and surrounding whitespace, and the matching entry of
    `values` is returned in its canonical spelling, so that e.g.
    NOTIFY=every_run is accepted like NOTIFY=EVERY_RUN.

    Raises ValueError when the value does not match any allowed one.
    '''
    val = self.getStr(name, key, defaultValue)
    if val in values:
        return val

    if isinstance(val, str):
        wanted = val.strip().casefold()
        for allowed in values:
            if allowed.casefold() == wanted:
                return allowed

    raise ValueError("Invalid value {} for configuration {}: expected one of {}".format(val, key, ', '.join(values)))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Reference in New Issue
Block a user