1
0
Fork 0

nagios/check_swaping: add a check on oom_kill and change the threshold unit

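In order to standardize units, the swapping thresholds are now expressed in bytes per second instead of pages per second (page counts are multiplied by the kernel page size).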
This commit is contained in:
Chl 2021-09-17 00:22:13 +02:00
parent 91a513481e
commit 68ec8f8812

View file

@ -11,11 +11,13 @@
# Default values # Default values
THRESHOLD_WARNING="50" PAGESIZE=4096
THRESHOLD_CRITICAL="200" THRESHOLD_WARNING="200000"
THRESHOLD_CRITICAL="1000000"
THRESHOLD_OOMKILL="0"
DISABLE_OOMKILL=""
VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt" VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt"
# Output # Output
OUTPUT_EXIT_STATUS=0 OUTPUT_EXIT_STATUS=0
OUTPUT_DETAIL_WARNING="" OUTPUT_DETAIL_WARNING=""
@ -28,6 +30,9 @@ REVISION="0.1"
# Stop at the first non-catched error # Stop at the first non-catched error
set -e set -e
# Set/guess current page size
which getconf >/dev/null 2>&1 && PAGESIZE="$( getconf PAGESIZE )"
# Include check_range() # Include check_range()
. $PROGPATH/utils.sh . $PROGPATH/utils.sh
# If you don't have the previous file, just comment the line and uncomment the following # If you don't have the previous file, just comment the line and uncomment the following
@ -45,17 +50,18 @@ set -e
usage() { usage() {
cat <<EOF cat <<EOF
Usage : Usage :
$0 [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ] $0 [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ] [ -k oom_kill_threshold ] [ -K ]
Note 1 : the script will measure the number of seconds passed since its last call and will Note 1 : the script will measure the number of seconds passed since its last call and will
divide the measures accordingly, so write the thresholds using pages/s in mind. divide the measures accordingly, so write the thresholds using bytes/second in mind.
The script will measure against pswpin and pswpout added together. The script will measure against pswpin and pswpout added together.
Note 2 : the thresholds use the kernel page size as unit. Use 'getconf PAGESIZE' to get it if needed. Note 2 : if 'oom_kill' is not available in /proc/vmstat, you can disable the check with '-K'.
Default values: Default values:
warning_threshold : $THRESHOLD_WARNING warning_threshold : $THRESHOLD_WARNING
critical_threshold : $THRESHOLD_CRITICAL critical_threshold : $THRESHOLD_CRITICAL
oom_kill_threshold : $THRESHOLD_OOMKILL
vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE
EOF EOF
} }
@ -89,7 +95,7 @@ is_int() {
# #
# Parameters management # Parameters management
# #
while getopts hw:c:f: f; do while getopts hw:c:f:k:K f; do
case "$f" in case "$f" in
'h') 'h')
usage usage
@ -108,6 +114,14 @@ while getopts hw:c:f: f; do
VMSTAT_PREVIOUS_DATA_FILE="$OPTARG" VMSTAT_PREVIOUS_DATA_FILE="$OPTARG"
;; ;;
'k')
THRESHOLD_OOMKILL="$OPTARG"
;;
'K')
DISABLE_OOMKILL="1"
;;
\?) \?)
usage usage
exit 1 exit 1
@ -117,10 +131,14 @@ done
shift $( expr $OPTIND - 1 ) shift $( expr $OPTIND - 1 )
# Little checks # Little checks
if ! is_int "$THRESHOLD_WARNING" || ! is_int "$THRESHOLD_CRITICAL"; then if ! is_int "$THRESHOLD_WARNING" || ! is_int "$THRESHOLD_CRITICAL" || ! is_int "$THRESHOLD_OOMKILL" ; then
echo "UNKNOWN invalid parameter : one of the threshold is not an integer." echo "UNKNOWN invalid parameter : one of the threshold is not an integer."
exit $STATE_UNKNOWN exit $STATE_UNKNOWN
fi fi
if [ -z "$DISABLE_OOMKILL" ] && ! grep '^oom_kill ' /proc/vmstat >/dev/null 2>&1; then
echo "UNKNOWN threshold set for oom_kill but the counter not available in /proc/vmstat."
exit $STATE_UNKNOWN
fi
# Check if the previous data file exists # Check if the previous data file exists
if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
@ -132,8 +150,8 @@ if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
fi fi
# Read previous data # Read previous data
PREVIOUS_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" PREVIOUS_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
PREVIOUS_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" PREVIOUS_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
PREVIOUS_PSWPTOTAL=$(( $PREVIOUS_PSWPIN + $PREVIOUS_PSWPOUT )) PREVIOUS_PSWPTOTAL=$(( $PREVIOUS_PSWPIN + $PREVIOUS_PSWPOUT ))
# Get time elapsed since last call # Get time elapsed since last call
@ -145,21 +163,28 @@ fi
# Update and read current data # Update and read current data
update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE" update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
CURRENT_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" CURRENT_OOMKILL="$( grep '^oom_kill' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
CURRENT_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" CURRENT_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
CURRENT_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
CURRENT_PSWPTOTAL=$(( $CURRENT_PSWPIN + $CURRENT_PSWPOUT )) CURRENT_PSWPTOTAL=$(( $CURRENT_PSWPIN + $CURRENT_PSWPOUT ))
# Calculate the swaping rate # Calculate the swaping rate
PSWP_RATE=$(( ( $CURRENT_PSWPTOTAL - $PREVIOUS_PSWPTOTAL ) / $PREVIOUS_DATA_AGE )) PSWP_RATE=$(( ( $CURRENT_PSWPTOTAL - $PREVIOUS_PSWPTOTAL ) / $PREVIOUS_DATA_AGE ))
# Generate perfdata # Generate perfdata
OUTPUT_PERFDATA="$( printf " pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )" if [ -n "$CURRENT_OOMKILL" ]; then
OUTPUT_PERFDATA="$( printf " oomkill=%d" "$CURRENT_OOMKILL" )"
fi
OUTPUT_PERFDATA="$( printf "$OUTPUT_PERFDATA pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )"
# Comparison # Comparison
# note: remember that numbers can be reseted to zero from time to time (64bits counter ?) # note: remember that numbers can be reseted to zero from time to time (64bits counter ?)
if [ $PSWP_RATE -gt $THRESHOLD_CRITICAL ]; then if [ $PSWP_RATE -gt $THRESHOLD_CRITICAL ]; then
echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA" echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA"
exit $STATE_CRITICAL exit $STATE_CRITICAL
elif [ -z "$DISABLE_OOMKILL" ] && [ $CURRENT_OOMKILL -gt $THRESHOLD_OOMKILL ]; then
echo "CRITICAL $CURRENT_OOMKILL OOM kill(s) (limit at $THRESHOLD_OOMKILL) |$OUTPUT_PERFDATA"
exit $STATE_CRITICAL
elif [ $PSWP_RATE -gt $THRESHOLD_WARNING ]; then elif [ $PSWP_RATE -gt $THRESHOLD_WARNING ]; then
echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA" echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA"
exit $STATE_WARNING exit $STATE_WARNING