nagios/check_swaping: add a check on oom_kill and change the unit to bytes
In order to standardize units, the swap counters and thresholds are now expressed in bytes (pages multiplied by the page size) instead of pages.
This commit is contained in:
parent
91a513481e
commit
68ec8f8812
1 changed file with 38 additions and 13 deletions
|
@@ -11,11 +11,13 @@
|
||||||
|
|
||||||
|
|
||||||
# Default values
|
# Default values
|
||||||
THRESHOLD_WARNING="50"
|
PAGESIZE=4096
|
||||||
THRESHOLD_CRITICAL="200"
|
THRESHOLD_WARNING="200000"
|
||||||
|
THRESHOLD_CRITICAL="1000000"
|
||||||
|
THRESHOLD_OOMKILL="0"
|
||||||
|
DISABLE_OOMKILL=""
|
||||||
VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt"
|
VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt"
|
||||||
|
|
||||||
|
|
||||||
# Output
|
# Output
|
||||||
OUTPUT_EXIT_STATUS=0
|
OUTPUT_EXIT_STATUS=0
|
||||||
OUTPUT_DETAIL_WARNING=""
|
OUTPUT_DETAIL_WARNING=""
|
||||||
|
@@ -28,6 +30,9 @@ REVISION="0.1"
|
||||||
# Stop at the first uncaught error
|
# Stop at the first uncaught error
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Set/guess current page size
|
||||||
|
which getconf >/dev/null 2>&1 && PAGESIZE="$( getconf PAGESIZE )"
|
||||||
|
|
||||||
# Include check_range()
|
# Include check_range()
|
||||||
. $PROGPATH/utils.sh
|
. $PROGPATH/utils.sh
|
||||||
# If you don't have the previous file, just comment the line and uncomment the following
|
# If you don't have the previous file, just comment the line and uncomment the following
|
||||||
|
@@ -45,17 +50,18 @@ set -e
|
||||||
usage() {
|
usage() {
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
Usage :
|
Usage :
|
||||||
$0 [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ]
|
$0 [-w warning_threshold] [-c critical_threshold] [-f vmstat_previous_data_file ] [ -k oom_kill_threshold ] [ -K ]
|
||||||
|
|
||||||
Note 1 : the script will measure the number of seconds passed since its last call and will
|
Note 1 : the script will measure the number of seconds passed since its last call and will
|
||||||
divide the measures accordingly, so write the thresholds using pages/s in mind.
|
divide the measures accordingly, so write the thresholds using bytes/second in mind.
|
||||||
The script will measure against pswpin and pswpout added together.
|
The script will measure against pswpin and pswpout added together.
|
||||||
|
|
||||||
Note 2 : the thresholds use the kernel page size as unit. Use 'getconf PAGESIZE' to get it if needed.
|
Note 2 : if 'oom_kill' is not available in /proc/vmstat, you can disable the check with '-K'.
|
||||||
|
|
||||||
Default values:
|
Default values:
|
||||||
warning_threshold : $THRESHOLD_WARNING
|
warning_threshold : $THRESHOLD_WARNING
|
||||||
critical_threshold : $THRESHOLD_CRITICAL
|
critical_threshold : $THRESHOLD_CRITICAL
|
||||||
|
oom_kill_threshold : $THRESHOLD_OOMKILL
|
||||||
vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE
|
vmstat_previous_data_file : $VMSTAT_PREVIOUS_DATA_FILE
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
@@ -89,7 +95,7 @@ is_int() {
|
||||||
#
|
#
|
||||||
# Parameters management
|
# Parameters management
|
||||||
#
|
#
|
||||||
while getopts hw:c:f: f; do
|
while getopts hw:c:f:k:K f; do
|
||||||
case "$f" in
|
case "$f" in
|
||||||
'h')
|
'h')
|
||||||
usage
|
usage
|
||||||
|
@@ -108,6 +114,14 @@ while getopts hw:c:f: f; do
|
||||||
VMSTAT_PREVIOUS_DATA_FILE="$OPTARG"
|
VMSTAT_PREVIOUS_DATA_FILE="$OPTARG"
|
||||||
;;
|
;;
|
||||||
|
|
||||||
|
'k')
|
||||||
|
THRESHOLD_OOMKILL="$OPTARG"
|
||||||
|
;;
|
||||||
|
|
||||||
|
'K')
|
||||||
|
DISABLE_OOMKILL="1"
|
||||||
|
;;
|
||||||
|
|
||||||
\?)
|
\?)
|
||||||
usage
|
usage
|
||||||
exit 1
|
exit 1
|
||||||
|
@@ -117,10 +131,14 @@ done
|
||||||
shift $( expr $OPTIND - 1 )
|
shift $( expr $OPTIND - 1 )
|
||||||
|
|
||||||
# Little checks
|
# Little checks
|
||||||
if ! is_int "$THRESHOLD_WARNING" || ! is_int "$THRESHOLD_CRITICAL"; then
|
if ! is_int "$THRESHOLD_WARNING" || ! is_int "$THRESHOLD_CRITICAL" || ! is_int "$THRESHOLD_OOMKILL" ; then
|
||||||
echo "UNKNOWN invalid parameter : one of the threshold is not an integer."
|
echo "UNKNOWN invalid parameter : one of the threshold is not an integer."
|
||||||
exit $STATE_UNKNOWN
|
exit $STATE_UNKNOWN
|
||||||
fi
|
fi
|
||||||
|
if [ -z "$DISABLE_OOMKILL" ] && ! grep '^oom_kill ' /proc/vmstat >/dev/null 2>&1; then
|
||||||
|
echo "UNKNOWN threshold set for oom_kill but the counter not available in /proc/vmstat."
|
||||||
|
exit $STATE_UNKNOWN
|
||||||
|
fi
|
||||||
|
|
||||||
# Check if the previous data file exists
|
# Check if the previous data file exists
|
||||||
if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
|
if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
|
||||||
|
@@ -132,8 +150,8 @@ if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Read previous data
|
# Read previous data
|
||||||
PREVIOUS_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
|
PREVIOUS_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
|
||||||
PREVIOUS_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
|
PREVIOUS_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
|
||||||
PREVIOUS_PSWPTOTAL=$(( $PREVIOUS_PSWPIN + $PREVIOUS_PSWPOUT ))
|
PREVIOUS_PSWPTOTAL=$(( $PREVIOUS_PSWPIN + $PREVIOUS_PSWPOUT ))
|
||||||
|
|
||||||
# Get time elapsed since last call
|
# Get time elapsed since last call
|
||||||
|
@@ -145,21 +163,28 @@ fi
|
||||||
|
|
||||||
# Update and read current data
|
# Update and read current data
|
||||||
update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
|
update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE"
|
||||||
CURRENT_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
|
CURRENT_OOMKILL="$( grep '^oom_kill' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
|
||||||
CURRENT_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )"
|
CURRENT_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
|
||||||
|
CURRENT_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))"
|
||||||
CURRENT_PSWPTOTAL=$(( $CURRENT_PSWPIN + $CURRENT_PSWPOUT ))
|
CURRENT_PSWPTOTAL=$(( $CURRENT_PSWPIN + $CURRENT_PSWPOUT ))
|
||||||
|
|
||||||
# Calculate the swaping rate
|
# Calculate the swaping rate
|
||||||
PSWP_RATE=$(( ( $CURRENT_PSWPTOTAL - $PREVIOUS_PSWPTOTAL ) / $PREVIOUS_DATA_AGE ))
|
PSWP_RATE=$(( ( $CURRENT_PSWPTOTAL - $PREVIOUS_PSWPTOTAL ) / $PREVIOUS_DATA_AGE ))
|
||||||
|
|
||||||
# Generate perfdata
|
# Generate perfdata
|
||||||
OUTPUT_PERFDATA="$( printf " pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )"
|
if [ -n "$CURRENT_OOMKILL" ]; then
|
||||||
|
OUTPUT_PERFDATA="$( printf " oomkill=%d" "$CURRENT_OOMKILL" )"
|
||||||
|
fi
|
||||||
|
OUTPUT_PERFDATA="$( printf "$OUTPUT_PERFDATA pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )"
|
||||||
|
|
||||||
# Comparison
|
# Comparison
|
||||||
# note: remember that the counters can be reset to zero from time to time (64-bit counter?)
|
# note: remember that the counters can be reset to zero from time to time (64-bit counter?)
|
||||||
if [ $PSWP_RATE -gt $THRESHOLD_CRITICAL ]; then
|
if [ $PSWP_RATE -gt $THRESHOLD_CRITICAL ]; then
|
||||||
echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA"
|
echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA"
|
||||||
exit $STATE_CRITICAL
|
exit $STATE_CRITICAL
|
||||||
|
elif [ -z "$DISABLE_OOMKILL" ] && [ $CURRENT_OOMKILL -gt $THRESHOLD_OOMKILL ]; then
|
||||||
|
echo "CRITICAL $CURRENT_OOMKILL OOM kill(s) (limit at $THRESHOLD_OOMKILL) |$OUTPUT_PERFDATA"
|
||||||
|
exit $STATE_CRITICAL
|
||||||
elif [ $PSWP_RATE -gt $THRESHOLD_WARNING ]; then
|
elif [ $PSWP_RATE -gt $THRESHOLD_WARNING ]; then
|
||||||
echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA"
|
echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA"
|
||||||
exit $STATE_WARNING
|
exit $STATE_WARNING
|
||||||
|
|
Loading…
Reference in a new issue