# selfhost-utils/healthcheck/healthcheck.cfg.example
#
# Example healthcheck configuration.
[DEFAULT]

#### EMAIL NOTIFICATIONS ####
# Notify these email address(es) in case of alarm; separate multiple addresses with commas.
# Comment this out if you don't want emails to be sent (e.g. because you use ALARM_COMMAND below).
MAILTO=root@localhost, user@localhost
# Sender address
#MAILFROM=root@localhost
# Use a remote SMTP host (enable by removing the comment character)
#SMTPHOST=my.smtp.host:465
# SMTP credentials
# NOTE: the password is stored in plain text; consider restricting read access to this file.
#SMTPUSER=mysmtpuser
#SMTPPASS=mysmtppass
# Use SSL for SMTP
#SMTPSSL=True

#### RUN COMMAND IN CASE OF ALARM ####
# You can run a command or script when an alert is issued.
#
# In this example, `curl` is used to send a POST request to ntfy (https://ntfy.sh/), a service
# that delivers push notifications to smartphones and desktop computers.
# The notification title is passed in the `Title` HTTP header and the message in the request body.
# If you want to use ntfy, just replace the topic name with something unique (see the documentation
# at https://ntfy.sh/docs/ ), uncomment the ALARM_COMMAND entry and you are ready to go.
# If you generate a lot of traffic, please consider hosting your own ntfy server.
#
# Otherwise, you can replace the curl command with anything you want. You can use the following
# placeholders to pass your command/script the details about the event:
# %%CHECKNAME%% The name of the check (the one between square brackets in this config)
# %%HOSTNAME%% The host name
# %%DATETIME%% The date and time of the event, in human-readable format
# %%ERROR%% A human-readable error description (the same used in the mail alert)
#ALARM_COMMAND=curl -H "Title: %%CHECKNAME%% alarm on %%HOSTNAME%%" -d "%%ERROR%% on %%DATETIME%%" ntfy.sh/my-unique-topic-name
#### HEALTH CHECKS ####
# Every health check is based on a command being executed, its result being parsed with a regexp
# to extract (as a single group) the numeric or string value, and the value being compared with
# a configured value. These checks are ready to be used, just enable the ones you need.
# You can add your own custom check declaring another section like this:
#
# [my_custom_check_name]
# DISABLED=False
# ALARM_STRING_EQUAL=Lorem ipsum
# ALARM_STRING_NOT_EQUAL=The lazy fox
# ALARM_VALUE_EQUAL=99
# ALARM_VALUE_NOT_EQUAL=76.365338
# ALARM_VALUE_MORE_THAN=1.0
# ALARM_VALUE_LESS_THAN=12
# COMMAND=/my/custom/binary --with parameters
# REGEXP=my regex to parse (awesome|disappointing) command output
[system_load_1min]
# 1-minute system load average, taken from the output of `uptime`.
# The regexp captures the first of the three "load average" figures and accepts
# either "," or "." as the decimal separator.
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: (\d+[,.]\d+), \d+[,.]\d+, \d+[,.]\d+
ALARM_VALUE_MORE_THAN=1.0
[system_load_5min]
# 5-minute system load average, taken from the output of `uptime`.
# The regexp captures the second of the three "load average" figures and accepts
# either "," or "." as the decimal separator.
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, (\d+[,.]\d+), \d+[,.]\d+
ALARM_VALUE_MORE_THAN=1.0
[system_load_15min]
# 15-minute system load average, taken from the output of `uptime`.
# The regexp captures the third of the three "load average" figures and accepts
# either "," or "." as the decimal separator.
DISABLED=True
COMMAND=uptime
REGEXP=.*load average: \d+[,.]\d+, \d+[,.]\d+, (\d+[,.]\d+)
ALARM_VALUE_MORE_THAN=1.0
[used_disk_space]
# Percentage of used disk space on /dev/sda1, parsed from the output of `df`.
# Example: ALARM_VALUE_MORE_THAN=75 raises an alarm when the disk is more than 75% full.
DISABLED=True
COMMAND=df -h /dev/sda1
REGEXP=(\d{1,3})%
ALARM_VALUE_MORE_THAN=75
[raid_status]
# Raises an alarm when the RAID array is corrupted.
# The check looks at this part of the /proc/mdstat file:
# 243553280 blocks super 1.2 [2/2] [UU]
# If the last [ ] contains only U (no _), the RAID array is healthy.
# Otherwise [U_] or [_U] is displayed (with more U or _ characters if the array has more disks).
# The expected string below (UU) matches the two-disk example above.
DISABLED=True
COMMAND=cat /proc/mdstat
REGEXP=.*\] \[([U_]+)\]\n
ALARM_STRING_NOT_EQUAL=UU
[battery_level]
# Raises an alarm when the battery charge drops below the configured level
# (long blackout, pulled power cord...).
# Meant for laptops used as servers, apparently common among self-hosters.
# Requires the acpi package to be installed. The value is a percentage.
DISABLED=True
COMMAND=acpi -b
REGEXP=Battery \d: .*, (\d{1,3})%
ALARM_VALUE_LESS_THAN=90
[laptop_charger_disconnected]
# Raises an alarm when the laptop charger is disconnected.
# Meant for laptops used as servers, apparently common among self-hosters.
# Requires the acpi package to be installed.
# The regexp captures the adapter state printed by `acpi -a`; the alarm fires when it is "off-line".
DISABLED=True
COMMAND=acpi -a
REGEXP=Adapter \d: (.+)
ALARM_STRING_EQUAL=off-line
[free_ram]
# Free RAM, in percent of total RAM.
# Shows another approach: all the computation is done in the command itself and the
# whole output is used as the value (no REGEXP is declared).
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($4/$2 * 100.0)}'
ALARM_VALUE_LESS_THAN=20
[available_ram]
# Like free_ram, but uses the 7th column of the `free` output (available memory)
# instead of the 4th (free memory). You may want to use this if you use a memcache.
# As in free_ram, the command prints the final percentage, so no REGEXP is declared.
DISABLED=True
COMMAND=free | grep Mem | awk '{print int($7/$2 * 100.0)}'
ALARM_VALUE_LESS_THAN=20
[cpu_temperature]
# CPU temperature alarm: requires lm-sensors installed and configured (check your distribution's guide).
# The regexp must be adapted to your configuration: run `sensors` on the command line
# to find the name of the temperature sensor in your system. Here it is `Core 0`,
# but it may be called Tdie or many other names; there is no standard.
# Only the integer part of the temperature is captured; the decimal separator may be "," or ".".
DISABLED=True
ALARM_VALUE_MORE_THAN=80
COMMAND=sensors
REGEXP=Core 0: +\+?(-?\d{1,3})[,.]\d°[CF]
[fan_speed]
# Fan speed alarm: requires lm-sensors installed and configured (check your distribution's guide).
# The regexp must be adapted to your configuration: run `sensors` on the command line
# to find the name of the fan speed sensor in your system (here it is `cpu_fan`).
# The whole RPM figure is captured (one or more digits), so multi-digit speeds match.
DISABLED=True
ALARM_VALUE_LESS_THAN=300
COMMAND=sensors
REGEXP=cpu_fan: +(\d+) RPM