2022-04-15 08:23:40 +02:00
|
|
|
# The DEFAULT section contains the global configuration applied to all checks.
|
|
|
|
# You can redefine these variables in a check to override the global ones.
|
2022-04-01 00:02:23 +02:00
|
|
|
[DEFAULT]
|
|
|
|
|
2022-04-01 10:27:04 +02:00
|
|
|
#### EMAIL NOTIFICATIONS ####
|
2022-04-01 00:02:23 +02:00
|
|
|
|
|
|
|
# Notify these email address(es) in case of alarm; separate multiple addresses with commas
|
2022-04-01 10:27:04 +02:00
|
|
|
# Comment this out if you don't want emails to be sent (e.g. because you are using ALARM_COMMAND below)
|
2022-04-01 00:02:23 +02:00
|
|
|
MAILTO=root@localhost, user@localhost
|
|
|
|
|
|
|
|
# Sender address
|
2022-04-01 10:27:04 +02:00
|
|
|
#MAILFROM=root@localhost
|
2022-04-01 00:02:23 +02:00
|
|
|
|
|
|
|
# Use a remote SMTP host (enable by removing comment)
|
|
|
|
#SMTPHOST=my.smtp.host:465
|
|
|
|
|
|
|
|
# SMTP credentials
|
|
|
|
#SMTPUSER=mysmtpuser
|
|
|
|
#SMTPPASS=mysmtppass
|
|
|
|
|
|
|
|
# Use SSL for SMTP
|
|
|
|
#SMTPSSL=True
|
|
|
|
|
|
|
|
|
2022-04-01 10:27:04 +02:00
|
|
|
#### RUN COMMAND IN CASE OF ALARM ####
|
|
|
|
# You can run a command or script when an alert is issued.
|
|
|
|
#
|
|
|
|
# In this example, `curl` is used to send a POST request to Ntfy (https://ntfy.sh/), a service
|
|
|
|
# that delivers push notifications to smartphones and desktop computers.
|
|
|
|
# If you want to use ntfy, just change the topic name with something unique (see documentation
|
|
|
|
# at https://ntfy.sh/docs/ ), uncomment the ALARM_COMMAND entry and you are ready to go.
|
|
|
|
# If you generate a lot of traffic, please consider hosting your own ntfy server.
|
|
|
|
#
|
|
|
|
# Otherwise, you can replace the curl command with anything you want, you can use the following
|
|
|
|
# placeholders to pass your command/script the details about the event:
|
|
|
|
# %%CHECKNAME%% The name of the check (the one between square brackets in this config)
|
|
|
|
# %%HOSTNAME%% The host name
|
|
|
|
# %%DATETIME%% The date and time of the event, in human readable format
|
|
|
|
# %%ERROR%% A human-readable error description (the same used in the mail alert)
|
|
|
|
|
|
|
|
#ALARM_COMMAND=curl -H "%%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
|
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
|
2022-04-15 08:23:40 +02:00
|
|
|
#### NOTIFICATION POLICY ####
|
|
|
|
# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding.
|
|
|
|
# Possible values:
|
|
|
|
# EVERY_RUN In case of alarm, sends a mail every time the script is run
|
|
|
|
# START Sends a mail only when an alarm starts
|
|
|
|
# ONCE_IN_MINUTES In case of alarm, resends a mail only if NOTIFY_MINUTES has passed
|
|
|
|
NOTIFY=EVERY_RUN
|
|
|
|
|
|
|
|
# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES has passed from the previous one
|
|
|
|
NOTIFY_MINUTES=60
|
|
|
|
|
|
|
|
# Sends a mail when the alarm has ended
|
|
|
|
NOTIFY_ALARM_END=TRUE
|
|
|
|
|
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
#### HEALTH CHECKS ####
|
|
|
|
# Every health check is based on a command being executed, its result being parsed with a regexp
|
|
|
|
# to extract (as a single group) the numeric or string value, and the value being compared with
|
|
|
|
# a configured value. These checks are ready to be used, just enable the ones you need.
|
2022-04-07 08:50:54 +02:00
|
|
|
#
|
|
|
|
# CUSTOM CHECKS:
|
2022-04-01 00:02:23 +02:00
|
|
|
# You can add your own custom check declaring another section like this:
|
|
|
|
#
|
|
|
|
# [my_custom_check_name]
|
|
|
|
# DISABLED=False
|
|
|
|
# ALARM_STRING_EQUAL=Lorem ipsum
|
|
|
|
# ALARM_STRING_NOT_EQUAL=The lazy fox
|
|
|
|
# ALARM_VALUE_EQUAL=99
|
|
|
|
# ALARM_VALUE_NOT_EQUAL=76.365338
|
|
|
|
# ALARM_VALUE_MORE_THAN=1.0
|
|
|
|
# ALARM_VALUE_LESS_THAN=12
|
|
|
|
# COMMAND=/my/custom/binary --with parameters
|
|
|
|
# REGEXP=my regex to parse (awesome|disappointing) command output
|
2022-04-07 08:50:54 +02:00
|
|
|
#
|
|
|
|
# First test your custom command executing it in the command line
|
|
|
|
# Take the text output and write a regex to match it. Check every case:
|
|
|
|
# success result, error result, command failure. Then paste the command
|
|
|
|
# and regex in this config, enable the check and run it to verify that it is working.
|
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
|
|
|
|
[system_load_1min]
|
|
|
|
# The system load average in the last minute
|
2022-04-16 00:15:49 +02:00
|
|
|
DISABLED=False
|
2022-04-01 00:02:23 +02:00
|
|
|
ALARM_VALUE_MORE_THAN=1.0
|
|
|
|
COMMAND=uptime
|
|
|
|
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[system_load_5min]
|
|
|
|
# The system load average in the last 5 minutes
|
2022-04-16 00:15:49 +02:00
|
|
|
DISABLED=False
|
2022-04-01 00:02:23 +02:00
|
|
|
ALARM_VALUE_MORE_THAN=1.0
|
|
|
|
COMMAND=uptime
|
|
|
|
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[system_load_15min]
|
|
|
|
# The system load average in the last 15 minutes
|
2022-04-16 00:15:49 +02:00
|
|
|
DISABLED=False
|
2022-04-01 00:02:23 +02:00
|
|
|
ALARM_VALUE_MORE_THAN=1.0
|
|
|
|
COMMAND=uptime
|
|
|
|
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[used_disk_space]
|
|
|
|
# Used disk space (in percent, i.e. ALARM_VALUE_MORE_THAN=75 -> alarm if disk is more than 75% full)
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_VALUE_MORE_THAN=75
|
|
|
|
COMMAND=df -h /dev/sda1
|
|
|
|
REGEXP=(\d{1,3})%
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[raid_status]
|
|
|
|
# Issues an alarm when the raid is corrupted
|
|
|
|
# Checks this part of the /proc/mdstat file:
|
|
|
|
# 243553280 blocks super 1.2 [2/2] [UU]
|
|
|
|
# If the content of the last [ ] contains only U (without _), the raid array is healthy
|
|
|
|
# Otherwise, [U_] or [_U] is displayed (may contain more U or _ if the array has more disks)
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_STRING_NOT_EQUAL=UU
|
|
|
|
COMMAND=cat /proc/mdstat
|
|
|
|
REGEXP=.*\] \[([U_]+)\]\n
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[battery_level]
|
|
|
|
# Issues an alarm when battery is discharging below a certain level (long blackout, pulled power cord...)
|
|
|
|
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
|
|
|
# Value is in %
|
|
|
|
DISABLED=True
|
|
|
|
COMMAND=acpi -b
|
|
|
|
REGEXP=Battery \d: .*, (\d{1,3})%
|
|
|
|
ALARM_VALUE_LESS_THAN=90
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[laptop_charger_disconnected]
|
|
|
|
# Issues an alarm when laptop charger is disconnected
|
|
|
|
# For laptops used as servers, apparently common among the self hosters. Requires acpi package installed.
|
|
|
|
DISABLED=True
|
|
|
|
COMMAND=acpi -a
|
|
|
|
REGEXP=Adapter \d: (.+)
|
2022-04-01 10:27:04 +02:00
|
|
|
ALARM_STRING_EQUAL=off-line
|
2022-04-01 00:02:23 +02:00
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2023-04-26 08:34:44 +02:00
|
|
|
[shutdown_on_battery_low]
|
|
|
|
# For laptops used as a server. Requires acpi package installed.
|
|
|
|
# When the battery is low, cleanly shuts down the system instead of waiting for it
|
|
|
|
# to shut down itself leaving all filesystems dirty.
|
|
|
|
# ALARM_COMMAND is the command executed when this check fails. Shuts down the system in
|
|
|
|
# 15 mins to allow for logging in and canceling the command. If you want to shut down
|
|
|
|
# immediately, replace the ALARM_COMMAND with "shutdown now".
|
|
|
|
# To cancel the shutdown, log in and "shutdown -c".
|
|
|
|
DISABLED=True
|
|
|
|
COMMAND=acpi -b
|
2023-04-27 08:10:27 +02:00
|
|
|
# Captures the charge percentage only from a battery line reporting "Discharging".
REGEXP=Battery \d: Discharging, (\d{1,3})%
|
2023-04-26 08:34:44 +02:00
|
|
|
ALARM_VALUE_LESS_THAN=50
|
|
|
|
ALARM_COMMAND=shutdown +15 "Shutdown in 15 mins due to battery low!"
|
|
|
|
|
|
|
|
|
2022-04-01 00:34:50 +02:00
|
|
|
[available_ram]
|
2022-04-16 00:15:49 +02:00
|
|
|
# Shows available ram in %.
|
|
|
|
DISABLED=False
|
2022-04-01 11:30:29 +02:00
|
|
|
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
|
2022-04-01 00:34:50 +02:00
|
|
|
ALARM_VALUE_LESS_THAN=20
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[cpu_temperature]
|
|
|
|
# CPU Temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
|
|
|
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
|
|
|
# to find the name of the temperature sensor in your system. In this case it is `Core 0`,
|
|
|
|
# but may be called Tdie or a lot of different names, there is no standard.
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_VALUE_MORE_THAN=80
|
|
|
|
COMMAND=sensors
|
|
|
|
# Captures the integer part of the `Core 0` reading; the decimal separator may be "." or ",".
REGEXP=Core 0: +\+?(-?\d{1,3})[,.]\d°[CF]
|
|
|
|
|
2022-04-07 08:50:54 +02:00
|
|
|
|
2022-04-01 00:02:23 +02:00
|
|
|
[fan_speed]
|
|
|
|
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
|
|
|
|
# The regexp must be adapted to your configuration: run `sensors` in the command line
|
|
|
|
# to find the name of the fan speed sensor in your system.
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_VALUE_LESS_THAN=300
|
|
|
|
COMMAND=sensors
|
2022-10-09 12:45:49 +02:00
|
|
|
REGEXP=cpu_fan: +(\d+) RPM
|
2022-04-07 08:50:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
[host_reachability]
|
|
|
|
# Check if a remote host is alive with Ping. You can replace the ip with a domain name (e.g. COMMAND=ping debian.org -c 1)
|
|
|
|
#
|
|
|
|
# Shows another approach: uses the return value to print a string. Leverages ping's ability to return different error codes:
|
|
|
|
# 0 = success
|
|
|
|
# 1 = the host is unreachable
|
|
|
|
# 2 = an error has occurred (and will be logged to stderr)
|
|
|
|
# We are throwing away stdout and replacing it with a custom text.
|
|
|
|
# If there is a different text (the stderr), something bad happened, and it will be reported in the mail.
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_STRING_NOT_EQUAL=Online
|
|
|
|
COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline"
|
|
|
|
|
|
|
|
|
|
|
|
[service_webserver]
|
|
|
|
# Check if a webserver is running on port 80. You can replace the ip with a domain name.
|
|
|
|
# You can check different services changing the port number. Some examples:
|
|
|
|
# 80 HTTP Webserver
|
|
|
|
# 443 HTTPS Webserver
|
|
|
|
# 21 FTP
|
|
|
|
# 22 SSH
|
|
|
|
# 5900 VNC (Linux remote desktop)
|
|
|
|
# 3389 RDP (Windows remote desktop)
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_STRING_NOT_EQUAL=Online
|
|
|
|
COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline"
|
2022-04-15 08:23:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
[dummy_always_alarm]
|
|
|
|
# A dummy check that is always in alarm. Useful for testing notifications.
|
|
|
|
DISABLED=True
|
2022-04-15 08:59:10 +02:00
|
|
|
ALARM_STRING_EQUAL=Core meltdown!
|
2022-04-15 08:23:40 +02:00
|
|
|
COMMAND=echo "Core meltdown!"
|
2022-05-06 08:57:41 +02:00
|
|
|
|
|
|
|
|
|
|
|
[security_updates_available]
|
|
|
|
# Checks for security updates via apt (works on Debian and derivatives, like Ubuntu).
|
|
|
|
# Needs the repositories to be updated with `apt update`, but that is a heavy command, so it may
|
|
|
|
# be configured to run daily from a separate entry in the same cron file as healthcheck.
|
|
|
|
# E.g.: place this string in /etc/cron.d/healthcheck, before the healthcheck command:
|
|
|
|
# 1 1 * * * root apt update
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_STRING_EQUAL=security updates available
|
2022-05-28 12:41:04 +02:00
|
|
|
REGEXP=(security updates available|NO security updates available)
|
2022-05-06 08:57:41 +02:00
|
|
|
COMMAND=apt list --upgradable 2>/dev/null | grep -e "-security" && echo "security updates available" || echo "NO security updates available"
|
2022-05-28 12:41:04 +02:00
|
|
|
NOTIFY=START
|
2022-10-06 09:37:12 +02:00
|
|
|
|
|
|
|
|
|
|
|
[ups_power]
|
|
|
|
# Raises an alarm when UPS runs on battery.
|
|
|
|
# Requires NUT installed and configured on the system
|
|
|
|
# See complete documentation and support lists: https://networkupstools.org
|
|
|
|
# See simple start-up guide for Debian: https://wiki.debian.org/nut
|
|
|
|
# This config is for usbhid-ups driver. If you use a different driver, you may need
|
|
|
|
# to change the REGEXP to fit your output.
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_STRING_NOT_EQUAL=OL
|
|
|
|
COMMAND=upsc eaton1600 2> /dev/null
|
|
|
|
REGEXP=^ups\.status: (OL|OB)$
|
|
|
|
NOTIFY=START
|
|
|
|
|
|
|
|
[ups_battery]
|
|
|
|
# Raises an alarm when UPS battery is discharged below 50%.
|
|
|
|
# Requires NUT installed and configured on the system
|
|
|
|
# See complete documentation and support lists: https://networkupstools.org
|
|
|
|
# See simple start-up guide for Debian: https://wiki.debian.org/nut
|
|
|
|
# This config is for usbhid-ups driver. If you use a different driver, you may need
|
|
|
|
# to change the REGEXP to fit your output.
|
|
|
|
DISABLED=True
|
|
|
|
ALARM_VALUE_LESS_THAN=50
|
|
|
|
COMMAND=upsc eaton1600 2> /dev/null
|
|
|
|
REGEXP=^battery\.charge: (\d{1,3})$
|
|
|
|
NOTIFY=ONCE_IN_MINUTES
|
|
|
|
NOTIFY_MINUTES=15
|