1
0
Fork 0
scripts-admin-quickndirty-p.../nagios/check_swaping.sh
Chl 68ec8f8812 nagios/check_swaping: adding check on oom_kill + change of unit
In order to standardize units, we use bytes instead of page/pagesize.
2021-09-17 00:22:13 +02:00

195 lines
5.5 KiB
Bash
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/sh
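# Little custom script to:
# - check if the system has a heavy swap activity (pswpin + pswpout rate)
# - check the oom_kill counter of /proc/vmstat (can be disabled with -K)
# - record the pswpin/pswpout counters (and oom_kill, when available) as perfdata
#
# For pnp4nagios' sake, create a file usually named 'check_nrpe_swaping.cfg'
# (depends on your configuration) with the following content:
# DATATYPE = COUNTER,COUNTER
# NOTE(review): when the oomkill perfdata is emitted there are three values
# (oomkill, pswpin, pswpout); confirm whether DATATYPE needs a third entry.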
# Default values
# (thresholds and data file can be overridden from the command line)
# Fallback page size in bytes, replaced later by 'getconf PAGESIZE' when available
PAGESIZE=4096
# Swap rate thresholds (pswpin + pswpout), in bytes/second
THRESHOLD_WARNING="200000"
THRESHOLD_CRITICAL="1000000"
# CRITICAL when the oom_kill counter is strictly above this value
THRESHOLD_OOMKILL="0"
# Non-empty = skip the oom_kill check (set by -K)
DISABLE_OOMKILL=""
# Copy of /proc/vmstat kept between two runs
VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt"
# Output
OUTPUT_EXIT_STATUS=0
OUTPUT_DETAIL_WARNING=""
OUTPUT_DETAIL_CRITICAL=""
OUTPUT_PERFDATA=""
# Directory holding this script (utils.sh is looked up there).
# dirname yields "." when the script is started without any directory part,
# where the former sed expression returned the script name itself.
PROGPATH="$( dirname -- "$0" )"
REVISION="0.1"
# Stop at the first non-catched error
set -e
# Set/guess current page size: keep the default value unless getconf
# answers with a plain positive-looking integer
if command -v getconf >/dev/null 2>&1; then
  DETECTED_PAGESIZE="$( getconf PAGESIZE 2>/dev/null )" || DETECTED_PAGESIZE=""
  case "$DETECTED_PAGESIZE" in
    ''|*[!0-9]*) ;;
    *) PAGESIZE="$DETECTED_PAGESIZE" ;;
  esac
fi
# Include check_range() (and, presumably, the STATE_* return codes) when the
# Nagios helper file sits next to this script
if [ -r "$PROGPATH/utils.sh" ]; then
  . "$PROGPATH/utils.sh"
fi
# If utils.sh is missing, fall back to the usual return codes
# (it's only possible because we don't use the range checking function in this script)
: "${STATE_OK:=0}"
: "${STATE_WARNING:=1}"
: "${STATE_CRITICAL:=2}"
: "${STATE_UNKNOWN:=3}"
: "${STATE_DEPENDENT:=4}"
#
# Help function
#
# Prints the synopsis, the notes about units and the current values of the
# thresholds / data file on stdout.
# Reads: $0, THRESHOLD_WARNING, THRESHOLD_CRITICAL, THRESHOLD_OOMKILL,
#        VMSTAT_PREVIOUS_DATA_FILE
#
usage() {
  cat <<EOF
Usage :
$0 [-h] [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ] [ -k oom_kill_threshold ] [ -K ]
Note 1 : the script will measure the number of seconds passed since its last call and will
divide the measures accordingly, so write the thresholds using bytes/second in mind.
The script will measure against pswpin and pswpout added together.
Note 2 : if 'oom_kill' is not available in /proc/vmstat, you can disable the check with '-K'.
Default values:
warning_threshold : $THRESHOLD_WARNING
critical_threshold : $THRESHOLD_CRITICAL
oom_kill_threshold : $THRESHOLD_OOMKILL
vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE
EOF
}
#
# Copy fresh vmstat data into the file passed as an argument
#
# $1 : path of the file to (re)write: three header lines followed by the
#      current content of /proc/vmstat.
# Returns 0 on success, non-zero when the destination is missing or the
# snapshot could not be written.
#
# Everything runs in a subshell so the hardened umask does not leak into the
# caller. The snapshot is first written to a temporary file created next to
# the destination and then renamed over it, so a concurrent run never reads a
# half-written file; when no temporary file can be created or renamed there,
# the destination is overwritten in place as a fallback.
#
update_vmstat_previous_data_file() {
  if [ -z "$1" ]; then
    echo "update_vmstat_previous_data_file: missing destination file" >&2
    return 1
  fi
  (
    # Harden default files permissions to avoid some data leaks
    umask "0077" || true

    # Write the header and the current counters into the file given as $1
    write_vmstat_snapshot() {
      {
        echo "# This file was written by $0 for monitoring swap activity." &&
        echo "# It can be deleted if the need arise, it will be easily recreated without too much lost." &&
        echo "" &&
        cat /proc/vmstat
      } >"$1"
    }

    if snapshot_tmp="$( mktemp -- "$1.XXXXXX" 2>/dev/null )"; then
      if write_vmstat_snapshot "$snapshot_tmp" && mv -f -- "$snapshot_tmp" "$1" 2>/dev/null; then
        exit 0
      fi
      # Do not leave the temporary file behind before trying the fallback
      rm -f -- "$snapshot_tmp"
    fi
    write_vmstat_snapshot "$1"
  )
}
#
# Check if arg is an integer
# (copied from jilles @ http://stackoverflow.com/questions/806906/how-do-i-test-if-a-variable-is-a-number-in-bash )
#
# $1 : string to test
# Returns 0 when $1 is non-empty and made only of the digits 0-9,
# 1 otherwise (empty string, sign, dot, space, letters...).
#
is_int() {
  # An empty (or missing) argument is never an integer
  [ -n "$1" ] || return 1
  # Any single non-digit character disqualifies the value
  case "$1" in
    *[!0-9]*) return 1 ;;
  esac
  return 0
}
#
# Parameters management
#
# -h      : print the help and exit
# -w/-c   : warning / critical thresholds for the swap rate
# -f      : file holding the previous copy of /proc/vmstat
# -k      : oom_kill threshold
# -K      : disable the oom_kill check
# Any unknown option prints the help and exits with the UNKNOWN state, like
# the other invalid-parameter paths of this script (a plain 'exit 1' would be
# read by Nagios as a WARNING).
#
while getopts hw:c:f:k:K f; do
  case "$f" in
    h)
      usage
      exit 0
      ;;
    w)
      THRESHOLD_WARNING="$OPTARG"
      ;;
    c)
      THRESHOLD_CRITICAL="$OPTARG"
      ;;
    f)
      VMSTAT_PREVIOUS_DATA_FILE="$OPTARG"
      ;;
    k)
      THRESHOLD_OOMKILL="$OPTARG"
      ;;
    K)
      DISABLE_OOMKILL="1"
      ;;
    \?)
      usage
      # Falls back to 3 if the STATE_* codes were not loaded
      exit "${STATE_UNKNOWN:-3}"
      ;;
  esac
done
# Drop the parsed options, keep the remaining positional arguments
shift $(( OPTIND - 1 ))
# Little checks
# Every threshold must be a plain non-negative integer: stop with UNKNOWN at
# the first one which is not.
for THRESHOLD_VALUE in "$THRESHOLD_WARNING" "$THRESHOLD_CRITICAL" "$THRESHOLD_OOMKILL"; do
  if ! is_int "$THRESHOLD_VALUE"; then
    echo "UNKNOWN invalid parameter : one of the threshold is not an integer."
    exit $STATE_UNKNOWN
  fi
done
# Unless the oom_kill check was disabled (-K), the running kernel has to
# expose the 'oom_kill' counter in /proc/vmstat.
if [ -z "$DISABLE_OOMKILL" ]; then
  if ! grep -q '^oom_kill ' /proc/vmstat 2>/dev/null; then
    echo "UNKNOWN threshold set for oom_kill but the counter not available in /proc/vmstat."
    exit $STATE_UNKNOWN
  fi
fi
# Check if the previous data file exists
if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
  update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
  # We wait a little bit to gather some data even on the first run
  # (or we could return an UNKNOWN ?)
  sleep 2
fi

# Print the value of the counter named $1 stored in the vmstat-formatted
# file $2 (prints nothing when the counter is absent). The pattern is
# anchored on the whole key so a counter sharing the same prefix never matches.
read_vmstat_counter() {
  grep "^$1 " "$2" | cut -d " " -f 2
}

# Read previous data (page counts)
PREVIOUS_PSWPIN_PAGES="$( read_vmstat_counter pswpin "$VMSTAT_PREVIOUS_DATA_FILE" )"
PREVIOUS_PSWPOUT_PAGES="$( read_vmstat_counter pswpout "$VMSTAT_PREVIOUS_DATA_FILE" )"
if ! is_int "$PREVIOUS_PSWPIN_PAGES" || ! is_int "$PREVIOUS_PSWPOUT_PAGES"; then
  # Empty, truncated or foreign file: report it instead of letting the
  # arithmetic below abort the shell on a syntax error.
  echo "UNKNOWN: no usable pswpin/pswpout counters in $VMSTAT_PREVIOUS_DATA_FILE, trying to rebuild it."
  # Best effort only: the state is reported as UNKNOWN whatever happens here
  update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE" || true
  exit $STATE_UNKNOWN
fi
# Convert the page counts to bytes
PREVIOUS_PSWPIN="$(( PREVIOUS_PSWPIN_PAGES * PAGESIZE ))"
PREVIOUS_PSWPOUT="$(( PREVIOUS_PSWPOUT_PAGES * PAGESIZE ))"
PREVIOUS_PSWPTOTAL=$(( PREVIOUS_PSWPIN + PREVIOUS_PSWPOUT ))

# Get time elapsed since last call (age of the data file, in seconds)
PREVIOUS_DATA_MTIME="$( stat --printf="%Y" "$VMSTAT_PREVIOUS_DATA_FILE" 2>/dev/null )" || PREVIOUS_DATA_MTIME=""
if ! is_int "$PREVIOUS_DATA_MTIME"; then
  echo "UNKNOWN: cannot read the modification time of $VMSTAT_PREVIOUS_DATA_FILE."
  exit $STATE_UNKNOWN
fi
PREVIOUS_DATA_AGE=$(( $( date +%s ) - PREVIOUS_DATA_MTIME ))
if [ "$PREVIOUS_DATA_AGE" -le "0" ]; then
  echo "UNKNOWN: $PREVIOUS_DATA_AGE second(s) elapsed since last call."
  exit $STATE_UNKNOWN
fi

# Update and read current data
update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
CURRENT_OOMKILL="$( read_vmstat_counter oom_kill "$VMSTAT_PREVIOUS_DATA_FILE" )"
CURRENT_PSWPIN_PAGES="$( read_vmstat_counter pswpin "$VMSTAT_PREVIOUS_DATA_FILE" )"
CURRENT_PSWPOUT_PAGES="$( read_vmstat_counter pswpout "$VMSTAT_PREVIOUS_DATA_FILE" )"
if ! is_int "$CURRENT_PSWPIN_PAGES" || ! is_int "$CURRENT_PSWPOUT_PAGES"; then
  echo "UNKNOWN: pswpin/pswpout counters not found in the fresh copy of /proc/vmstat."
  exit $STATE_UNKNOWN
fi
CURRENT_PSWPIN="$(( CURRENT_PSWPIN_PAGES * PAGESIZE ))"
CURRENT_PSWPOUT="$(( CURRENT_PSWPOUT_PAGES * PAGESIZE ))"
CURRENT_PSWPTOTAL=$(( CURRENT_PSWPIN + CURRENT_PSWPOUT ))

# Calculate the swaping rate (bytes per second since the previous run)
PSWP_DELTA=$(( CURRENT_PSWPTOTAL - PREVIOUS_PSWPTOTAL ))
if [ "$PSWP_DELTA" -lt 0 ]; then
  # The counters went backwards, i.e. they restarted from zero (reboot...):
  # what has been counted since then is the best estimate available.
  PSWP_DELTA=$CURRENT_PSWPTOTAL
fi
PSWP_RATE=$(( PSWP_DELTA / PREVIOUS_DATA_AGE ))
# Generate perfdata
# Resulting string: " [oomkill=N] pswpin=N pswpout=N" (oomkill only when the
# counter could be read). Values are always passed as printf arguments, never
# as part of the format string, so a stray '%' or backslash in the data cannot
# be interpreted as a directive.
if [ -n "$CURRENT_OOMKILL" ]; then
  OUTPUT_PERFDATA="$( printf ' oomkill=%d' "$CURRENT_OOMKILL" )"
fi
OUTPUT_PERFDATA="$( printf '%s pswpin=%d pswpout=%d' "$OUTPUT_PERFDATA" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )"
# Comparison
# note: remember that the counters can be reset to zero from time to time (64bits counter ?)
#
# The verdict is worked out first (critical swap rate, then OOM kills, then
# warning swap rate, OK otherwise); the status line and the exit code are
# emitted once at the end.
# NOTE(review): the OOM test uses the raw oom_kill counter value, not the
# difference with the previous run - confirm this is intended.
CHECK_STATE=$STATE_OK
CHECK_MESSAGE="OK swaping rate at $PSWP_RATE"
if [ "$PSWP_RATE" -gt "$THRESHOLD_CRITICAL" ]; then
  CHECK_STATE=$STATE_CRITICAL
  CHECK_MESSAGE="CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL)"
elif [ -z "$DISABLE_OOMKILL" ] && [ "$CURRENT_OOMKILL" -gt "$THRESHOLD_OOMKILL" ]; then
  CHECK_STATE=$STATE_CRITICAL
  CHECK_MESSAGE="CRITICAL $CURRENT_OOMKILL OOM kill(s) (limit at $THRESHOLD_OOMKILL)"
elif [ "$PSWP_RATE" -gt "$THRESHOLD_WARNING" ]; then
  CHECK_STATE=$STATE_WARNING
  CHECK_MESSAGE="WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING)"
fi
# Human readable part, then the perfdata after the pipe
echo "$CHECK_MESSAGE |$OUTPUT_PERFDATA"
exit $CHECK_STATE