# The DEFAULT section contains the global configuration applied to all checks.
# You can re-define these variables in a check to override the global ones.
[DEFAULT]

#### EMAIL NOTIFICATIONS ####

# Notify these email address(es) in case of alarm; separate multiple addresses with commas.
# Comment this out if you don't want emails to be sent (e.g. because you use ALARM_COMMAND below).
MAILTO=root@localhost, user@localhost

# Sender address
#MAILFROM=root@localhost

# Use a remote SMTP host (enable by removing the comment)
#SMTPHOST=my.smtp.host:465

# SMTP credentials
#SMTPUSER=mysmtpuser
#SMTPPASS=mysmtppass

# Use SSL for SMTP
#SMTPSSL=True

#### RUN COMMAND IN CASE OF ALARM ####

# You can run a command or script when an alert is issued.
#
# In this example, `curl` is used to send a POST request to Ntfy (https://ntfy.sh/), a service
# that delivers push notifications to smartphones and desktop computers.
# If you want to use ntfy, just replace the topic name with something unique (see the documentation
# at https://ntfy.sh/docs/ ), uncomment the ALARM_COMMAND entry and you are ready to go.
# If you generate a lot of traffic, please consider hosting your own ntfy server.
#
# Otherwise, you can replace the curl command with anything you want. You can use the following
# placeholders to pass the details about the event to your command/script:
#   %%CHECKNAME%%  The name of the check (the one between square brackets in this config)
#   %%HOSTNAME%%   The host name
#   %%DATETIME%%   The date and time of the event, in human-readable format
#   %%ERROR%%      A human-readable error description (the same used in the mail alert)
#
# NOTE: curl only sends -H values written as "Name: value"; ntfy reads the notification
# title from the "Title" header.
#ALARM_COMMAND=curl -H "Title: %%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name

#### NOTIFICATION POLICY ####

# Defines when to send the email and/or execute ALARM_COMMAND. Useful to avoid email flooding.
# Possible values:
#   EVERY_RUN        In case of alarm, sends a mail every time the script is run
#   START            Sends a mail only when an alarm starts
#   ONCE_IN_MINUTES  In case of alarm, resends a mail only if NOTIFY_MINUTES has passed
NOTIFY=EVERY_RUN

# Used only if NOTIFY=ONCE_IN_MINUTES. A mail is sent only if NOTIFY_MINUTES has passed since the previous one
NOTIFY_MINUTES=60

# Sends a mail when the alarm has ended
NOTIFY_ALARM_END=TRUE

#### HEALTH CHECKS ####

# Every health check is based on a command being executed, its result being parsed with a regexp
# to extract (as a single group) the numeric or string value, and the value being compared with
# a configured value. The checks below are ready to be used, just enable the ones you need.
#
# CUSTOM CHECKS:
# You can add your own custom check by declaring another section like this:
#
# [my_custom_check_name]
# DISABLED=False
# ALARM_STRING_EQUAL=Lorem ipsum
# ALARM_STRING_NOT_EQUAL=The lazy fox
# ALARM_VALUE_EQUAL=99
# ALARM_VALUE_NOT_EQUAL=76.365338
# ALARM_VALUE_MORE_THAN=1.0
# ALARM_VALUE_LESS_THAN=12
# COMMAND=/my/custom/binary --with parameters
# REGEXP=my regex to parse (awesome|disappointing) command output
#
# First test your custom command by executing it in the command line.
# Take the text output and write a regex to match it. Check every case:
# success result, error result, command failure. Then paste the command
# and the regex in this config, enable the check and run it to verify that it is working.

#### AVOID FLAPPING ####

# For slowly changing values, like a server cabinet temperature, a lot of start/end alarm notifications
# may be generated if the value is near the limit (e.g. alarm set to 30, temperature jumping between 30.1
# and 29.9). To avoid this, a "grace" delta can be set with:
# DELTA=0.5
# In the previous example, this makes the alarm fire at 30.5 and stop at 29.5.
# If not specified, the delta is 0.

[system_load_1min]
# The system load average in the last minute
DISABLED=False
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+

[system_load_5min]
# The system load average in the last 5 minutes
DISABLED=False
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+

[system_load_15min]
# The system load average in the last 15 minutes
DISABLED=False
ALARM_VALUE_MORE_THAN=1.0
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)

[used_disk_space]
# Used disk space (in percent, e.g. ALARM_VALUE_MORE_THAN=75 -> alarm if the disk is more than 75% full)
DISABLED=True
ALARM_VALUE_MORE_THAN=75
COMMAND=df -h /dev/sda1
REGEXP=(\d{1,3})%

[raid_status]
# Issues an alarm when the raid is corrupted
# Checks this part of the /proc/mdstat file:
#     243553280 blocks super 1.2 [2/2] [UU]
# If the content of the last [ ] contains only U (without _), the raid array is healthy
# Otherwise, [U_] or [_U] is displayed (may contain more U or _ if the array has more disks)
DISABLED=True
ALARM_STRING_NOT_EQUAL=UU
COMMAND=cat /proc/mdstat
REGEXP=.*\] \[([U_]+)\]\n

[battery_level]
# Issues an alarm when the battery is discharging below a certain level (long blackout, pulled power cord...)
# For laptops used as servers, apparently common among self-hosters. Requires the acpi package installed.
# Value is in %
DISABLED=True
COMMAND=acpi -b
REGEXP=Battery \d: .*, (\d{1,3})%
ALARM_VALUE_LESS_THAN=90

[laptop_charger_disconnected]
# Issues an alarm when the laptop charger is disconnected
# For laptops used as servers, apparently common among self-hosters. Requires the acpi package installed.
DISABLED=True
COMMAND=acpi -a
REGEXP=Adapter \d: (.+)
ALARM_STRING_EQUAL=off-line

[shutdown_on_battery_low]
# For laptops used as a server. Requires the acpi package installed.
# When the battery is low, shuts down the system cleanly instead of waiting for it
# to shut down by itself leaving all filesystems dirty.
# ALARM_COMMAND is the command executed when this check fails. Shuts down the system in
# 15 mins to allow for logging in and cancelling the command. If you want to shut down
# immediately, replace the ALARM_COMMAND with "shutdown now".
# To cancel the shutdown, log in and run "shutdown -c".
DISABLED=True
COMMAND=acpi -b
# Matches only a "Discharging" battery line and captures the charge percentage.
REGEXP=Battery \d: Discharging, (\d{1,3})%
ALARM_VALUE_LESS_THAN=50
ALARM_COMMAND=shutdown +15 "Shutdown in 15 mins due to battery low!"

[available_ram]
# Shows available ram in %.
DISABLED=False
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
ALARM_VALUE_LESS_THAN=20

[cpu_temperature]
# CPU temperature alarm: requires lm-sensors installed and configured (check your distribution's guide)
# The regexp must be adapted to your configuration: run `sensors` in the command line
# to find the name of the temperature sensor in your system. In this case it is `Core 0`,
# but it may be called Tdie or a lot of different names, there is no standard.
DISABLED=True
ALARM_VALUE_MORE_THAN=80
COMMAND=sensors
REGEXP=Core 0: +\+?(-?\d{1,3}).\d°[CF]

[fan_speed]
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide)
# The regexp must be adapted to your configuration: run `sensors` in the command line
# to find the name of the fan speed sensor in your system.
DISABLED=True
ALARM_VALUE_LESS_THAN=300
COMMAND=sensors
REGEXP=cpu_fan: +(\d+) RPM

[host_reachability]
# Check if a remote host is alive with ping. You can replace the ip with a domain name (e.g. COMMAND=ping debian.org -c 1)
#
# Shows another approach: uses the return value to print a string. Leverages ping's ability to return different error codes:
#   0 = success
#   1 = the host is unreachable
#   2 = an error has occurred (and will be logged to stderr)
# We are throwing away stdout and replacing it with a custom text.
# If there is a different text (the stderr), something bad happened, and it will be reported in the mail.
DISABLED=True
ALARM_STRING_NOT_EQUAL=Online
COMMAND=ping 192.168.1.123 -c 1 > /dev/null && echo "Online" || echo "Offline"

[service_webserver]
# Check if a webserver is running on port 80. You can replace the ip with a domain name.
# You can check different services by changing the port number. Some examples:
#   80    HTTP Webserver
#   443   HTTPS Webserver
#   21    FTP
#   22    SSH
#   5900  VNC (Linux remote desktop)
#   3389  RDP (Windows remote desktop)
DISABLED=True
ALARM_STRING_NOT_EQUAL=Online
COMMAND=nc -z -w 3 192.168.1.123 80 > /dev/null && echo "Online" || echo "Offline"

[dummy_always_alarm]
# A dummy check that is always in alarm. Useful for testing notifications.
DISABLED=True
ALARM_STRING_EQUAL=Core meltdown!
COMMAND=echo "Core meltdown!"

[security_updates_available]
# Checks for security updates via apt (works on Debian and derivatives, like Ubuntu).
# Needs the repositories to be updated with `apt update`, but that is a heavy command, so it may
# be configured to be executed daily by a command in the same cron file as healthcheck.
# E.g.: place this string in /etc/cron.d/healthcheck, before the healthcheck command:
#   1 1 * * * root apt update
DISABLED=True
ALARM_STRING_EQUAL=security updates available
REGEXP=(security updates available|NO security updates available)
COMMAND=apt list --upgradable 2>/dev/null | grep -e "-security" && echo "security updates available" || echo "NO security updates available"
NOTIFY=START

[ups_power]
# Raises an alarm when the UPS runs on battery.
# Requires NUT installed and configured on the system
# See complete documentation and support lists: https://networkupstools.org
# See simple start-up guide for Debian: https://wiki.debian.org/nut
# This config is for the usbhid-ups driver. If you use a different driver, you may need
# to change the REGEXP to fit your output.
DISABLED=True
ALARM_STRING_NOT_EQUAL=OL
COMMAND=upsc eaton1600 2> /dev/null
REGEXP=^ups\.status: (OL|OB)$
NOTIFY=START

[ups_battery]
# Raises an alarm when the UPS battery is discharged below 50%.
# Requires NUT installed and configured on the system
# See complete documentation and support lists: https://networkupstools.org
# See simple start-up guide for Debian: https://wiki.debian.org/nut
# This config is for the usbhid-ups driver. If you use a different driver, you may need
# to change the REGEXP to fit your output.
DISABLED=True
ALARM_VALUE_LESS_THAN=50
COMMAND=upsc eaton1600 2> /dev/null
REGEXP=^battery\.charge: (\d{1,3})$
NOTIFY=ONCE_IN_MINUTES
NOTIFY_MINUTES=15