1
0
Fork 0
scripts-admin-quickndirty-p.../nagios/check_swaping.sh

195 lines
5.5 KiB
Bash
Raw Permalink Normal View History

2021-08-23 01:18:31 +02:00
#!/bin/sh
# Little custom script to :
# - check whether the system has heavy swap activity
# - record swap-in and swap-out activity as perfdata
#
# For pnp4nagios' sake, create a file usually named 'check_nrpe_swaping.cfg'
# (depends on your configuration) with the following content :
# DATATYPE = COUNTER,COUNTER
# Default values (thresholds are expressed in bytes/second, see usage())
PAGESIZE=4096
THRESHOLD_WARNING="200000"
THRESHOLD_CRITICAL="1000000"
THRESHOLD_OOMKILL="0"
DISABLE_OOMKILL=""
VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt"

# Output
OUTPUT_EXIT_STATUS=0
OUTPUT_DETAIL_WARNING=""
OUTPUT_DETAIL_CRITICAL=""
OUTPUT_PERFDATA=""

# Directory containing this script. dirname yields "." when $0 carries no
# path component, so the include below keeps working in that case too.
PROGPATH="$( dirname "$0" )"
REVISION="0.1"

# Stop at the first uncaught error
set -e

# Set/guess current page size; keep the default above if getconf is missing
# or fails.
if command -v getconf >/dev/null 2>&1; then
  PAGESIZE="$( getconf PAGESIZE 2>/dev/null || echo "$PAGESIZE" )"
fi

# Include check_range() and the STATE_* exit codes from utils.sh when it sits
# next to this script. Otherwise define the exit codes here: this is only
# possible because the range checking function is not used in this script.
if [ -r "$PROGPATH/utils.sh" ]; then
  . "$PROGPATH/utils.sh"
else
  STATE_OK=0
  STATE_WARNING=1
  STATE_CRITICAL=2
  STATE_UNKNOWN=3
  STATE_DEPENDENT=4
fi
#
# Help function
#
# Prints the command-line synopsis, two usage notes and the current default
# values on stdout. The values of THRESHOLD_WARNING, THRESHOLD_CRITICAL,
# THRESHOLD_OOMKILL and VMSTAT_PREVIOUS_DATA_FILE are expanded into the text.
#
usage() {
  # Unquoted delimiter on purpose: $0 and the defaults must be expanded.
  cat <<EOF
Usage :
$0 [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ] [ -k oom_kill_threshold ] [ -K ]

Note 1 : the script will measure the number of seconds passed since its last call and will
divide the measures accordingly, so write the thresholds using bytes/second in mind.
The script will measure against pswpin and pswpout added together.
Note 2 : if 'oom_kill' is not available in /proc/vmstat, you can disable the check with '-K'.

Default values:
warning_threshold : $THRESHOLD_WARNING
critical_threshold : $THRESHOLD_CRITICAL
oom_kill_threshold : $THRESHOLD_OOMKILL
vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE
EOF
}
#
# Snapshot /proc/vmstat into the file passed as first argument.
#   $1 - path of the data file; it is truncated and rewritten on each call
# The file starts with a two-line comment banner followed by an empty line,
# then the raw content of /proc/vmstat.
#
update_vmstat_previous_data_file() {
  # Tighten the permissions of newly created files to avoid some data leaks
  # (best effort: a umask failure is ignored)
  umask "0077" || true
  # One redirection for the whole snapshot: banner first, counters after
  {
    printf '%s\n' \
      "# This file was written by $0 for monitoring swap activity." \
      "# It can be deleted if the need arise, it will be easily recreated without too much lost." \
      ""
    cat /proc/vmstat
  } >"$1"
}
#
# Tell whether the first argument is a non-negative integer literal,
# i.e. a non-empty string made only of the digits 0-9.
#   $1 - string to check
# Returns 0 when it is, 1 otherwise (empty string, sign, spaces, letters...).
# (pattern taken from jilles @ http://stackoverflow.com/questions/806906/how-do-i-test-if-a-variable-is-a-number-in-bash )
#
is_int() {
  # An empty string is never a number
  [ -n "$1" ] || return 1
  # Any character outside 0-9 disqualifies the value
  case "$1" in
    *[!0-9]*) return 1 ;;
  esac
  return 0
}
#
# Parameters management
#
#   -h        print the help text and exit
#   -w VALUE  warning threshold          -> THRESHOLD_WARNING
#   -c VALUE  critical threshold         -> THRESHOLD_CRITICAL
#   -f FILE   previous data file         -> VMSTAT_PREVIOUS_DATA_FILE
#   -k VALUE  oom_kill threshold         -> THRESHOLD_OOMKILL
#   -K        disable the oom_kill check -> DISABLE_OOMKILL="1"
#
while getopts hw:c:f:k:K f; do
  case "$f" in
    h)
      usage
      exit
      ;;
    w)
      THRESHOLD_WARNING="$OPTARG"
      ;;
    c)
      THRESHOLD_CRITICAL="$OPTARG"
      ;;
    f)
      VMSTAT_PREVIOUS_DATA_FILE="$OPTARG"
      ;;
    k)
      THRESHOLD_OOMKILL="$OPTARG"
      ;;
    K)
      DISABLE_OOMKILL="1"
      ;;
    *)
      # Unknown option or missing argument: a bad invocation is reported as
      # UNKNOWN rather than with exit status 1, which would read as WARNING.
      usage
      exit "${STATE_UNKNOWN:-3}"
      ;;
  esac
done
# Drop the options that were just parsed
shift $(( OPTIND - 1 ))
# Little checks
# All three thresholds are compared numerically later on, so they must be
# plain non-negative integers.
if ! is_int "$THRESHOLD_WARNING" || ! is_int "$THRESHOLD_CRITICAL" || ! is_int "$THRESHOLD_OOMKILL"; then
  echo "UNKNOWN invalid parameter : one of the threshold is not an integer."
  exit "${STATE_UNKNOWN:-3}"
fi
# Unless the oom_kill check was disabled with -K, /proc/vmstat must expose
# the 'oom_kill' counter.
if [ -z "$DISABLE_OOMKILL" ] && ! grep -q '^oom_kill ' /proc/vmstat 2>/dev/null; then
  echo "UNKNOWN threshold set for oom_kill but the counter not available in /proc/vmstat."
  exit "${STATE_UNKNOWN:-3}"
fi
# Check if the previous data file exists (and is writable)
if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
  update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
  # We wait a little bit to gather some data even on the first run
  # (or we could return an UNKNOWN ?)
  sleep 2
fi

# Print the value of one counter stored in a /proc/vmstat-style file.
#   $1 - file to read
#   $2 - exact counter name (first field of the line)
# Prints nothing when the counter is absent or the file cannot be read.
read_vmstat_counter() {
  awk -v key="$2" '$1 == key { print $2; exit }' "$1" 2>/dev/null || true
}

# Read previous data as raw page counts first, so that a truncated or
# corrupted file is reported instead of breaking the arithmetic below.
PREVIOUS_PSWPIN="$( read_vmstat_counter "$VMSTAT_PREVIOUS_DATA_FILE" pswpin )"
PREVIOUS_PSWPOUT="$( read_vmstat_counter "$VMSTAT_PREVIOUS_DATA_FILE" pswpout )"
if ! is_int "$PREVIOUS_PSWPIN" || ! is_int "$PREVIOUS_PSWPOUT"; then
  # Rewrite the file so that the next run starts from a clean baseline
  update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
  echo "UNKNOWN: pswpin/pswpout missing or invalid in $VMSTAT_PREVIOUS_DATA_FILE (file regenerated)."
  exit "${STATE_UNKNOWN:-3}"
fi
# Convert pages to bytes
PREVIOUS_PSWPIN=$(( PREVIOUS_PSWPIN * PAGESIZE ))
PREVIOUS_PSWPOUT=$(( PREVIOUS_PSWPOUT * PAGESIZE ))
PREVIOUS_PSWPTOTAL=$(( PREVIOUS_PSWPIN + PREVIOUS_PSWPOUT ))

# Get time elapsed since last call: now minus the data file's modification
# time. This must be read before the file is rewritten below.
PREVIOUS_DATA_AGE=$(( $( date +%s ) - $( stat -c '%Y' "$VMSTAT_PREVIOUS_DATA_FILE" ) ))
if [ "$PREVIOUS_DATA_AGE" -le "0" ]; then
  echo "UNKNOWN: $PREVIOUS_DATA_AGE second(s) elapsed since last call."
  exit "${STATE_UNKNOWN:-3}"
fi

# Update and read current data
update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
CURRENT_OOMKILL="$( read_vmstat_counter "$VMSTAT_PREVIOUS_DATA_FILE" oom_kill )"
CURRENT_PSWPIN="$( read_vmstat_counter "$VMSTAT_PREVIOUS_DATA_FILE" pswpin )"
CURRENT_PSWPOUT="$( read_vmstat_counter "$VMSTAT_PREVIOUS_DATA_FILE" pswpout )"
if ! is_int "$CURRENT_PSWPIN" || ! is_int "$CURRENT_PSWPOUT"; then
  echo "UNKNOWN: pswpin/pswpout counters not found in /proc/vmstat."
  exit "${STATE_UNKNOWN:-3}"
fi
CURRENT_PSWPIN=$(( CURRENT_PSWPIN * PAGESIZE ))
CURRENT_PSWPOUT=$(( CURRENT_PSWPOUT * PAGESIZE ))
CURRENT_PSWPTOTAL=$(( CURRENT_PSWPIN + CURRENT_PSWPOUT ))

# Calculate the swaping rate (bytes per second since the previous call).
# If the total went backwards the counters were reset; the current total is
# then used as the amount swapped (assumes they restarted from zero).
PSWP_DELTA=$(( CURRENT_PSWPTOTAL - PREVIOUS_PSWPTOTAL ))
if [ "$PSWP_DELTA" -lt 0 ]; then
  PSWP_DELTA="$CURRENT_PSWPTOTAL"
fi
PSWP_RATE=$(( PSWP_DELTA / PREVIOUS_DATA_AGE ))
# Generate perfdata: " oomkill=N" (only when the counter was read) followed
# by " pswpin=N pswpout=N". The accumulated text is passed to printf as data
# (%s), never as the format string.
if [ -n "$CURRENT_OOMKILL" ]; then
  OUTPUT_PERFDATA="$( printf ' oomkill=%d' "$CURRENT_OOMKILL" )"
fi
OUTPUT_PERFDATA="$( printf '%s pswpin=%d pswpout=%d' "$OUTPUT_PERFDATA" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )"

# Comparison, most severe condition first
# note: remember that numbers can be reset to zero from time to time (64bits counter ?)
# NOTE(review): the oom_kill test compares the raw counter value, not its
# variation since the previous call - confirm this is the intended meaning.
if [ "$PSWP_RATE" -gt "$THRESHOLD_CRITICAL" ]; then
  echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA"
  exit "${STATE_CRITICAL:-2}"
elif [ -z "$DISABLE_OOMKILL" ] && [ "${CURRENT_OOMKILL:-0}" -gt "$THRESHOLD_OOMKILL" ]; then
  echo "CRITICAL $CURRENT_OOMKILL OOM kill(s) (limit at $THRESHOLD_OOMKILL) |$OUTPUT_PERFDATA"
  exit "${STATE_CRITICAL:-2}"
elif [ "$PSWP_RATE" -gt "$THRESHOLD_WARNING" ]; then
  echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA"
  exit "${STATE_WARNING:-1}"
fi
echo "OK swaping rate at $PSWP_RATE |$OUTPUT_PERFDATA"
exit "${STATE_OK:-0}"