diff --git a/nagios/check_swaping.sh b/nagios/check_swaping.sh index 36f5970..acf23d3 100755 --- a/nagios/check_swaping.sh +++ b/nagios/check_swaping.sh @@ -11,11 +11,13 @@ # Default values -THRESHOLD_WARNING="50" -THRESHOLD_CRITICAL="200" +PAGESIZE=4096 +THRESHOLD_WARNING="200000" +THRESHOLD_CRITICAL="1000000" +THRESHOLD_OOMKILL="0" +DISABLE_OOMKILL="" VMSTAT_PREVIOUS_DATA_FILE="/tmp/.monitoring_vmstat.txt" - # Output OUTPUT_EXIT_STATUS=0 OUTPUT_DETAIL_WARNING="" @@ -28,6 +30,9 @@ REVISION="0.1" # Stop at the first non-catched error set -e +# Set/guess current page size +which getconf >/dev/null 2>&1 && PAGESIZE="$( getconf PAGESIZE )" + # Include check_range() . $PROGPATH/utils.sh # If you don't have the previous file, just comment the line and uncomment the following @@ -45,17 +50,18 @@ set -e usage() { cat </dev/null 2>&1; then + echo "UNKNOWN threshold set for oom_kill but the counter not available in /proc/vmstat." + exit $STATE_UNKNOWN +fi # Check if the previous data file exists if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then @@ -132,8 +150,8 @@ if [ ! -w "$VMSTAT_PREVIOUS_DATA_FILE" ]; then fi # Read previous data -PREVIOUS_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" -PREVIOUS_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" +PREVIOUS_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))" +PREVIOUS_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))" PREVIOUS_PSWPTOTAL=$(( $PREVIOUS_PSWPIN + $PREVIOUS_PSWPOUT )) # Get time elapsed since last call @@ -145,21 +163,28 @@ fi # Update and read current data update_vmstat_previous_data_file "$VMSTAT_PREVIOUS_DATA_FILE" -CURRENT_PSWPIN="$( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" -CURRENT_PSWPOUT="$( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" +CURRENT_OOMKILL="$( grep '^oom_kill' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 )" +CURRENT_PSWPIN="$(( $( grep '^pswpin' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))" +CURRENT_PSWPOUT="$(( $( grep '^pswpout' "$VMSTAT_PREVIOUS_DATA_FILE" | cut -d " " -f 2 ) * $PAGESIZE ))" CURRENT_PSWPTOTAL=$(( $CURRENT_PSWPIN + $CURRENT_PSWPOUT )) # Calculate the swaping rate PSWP_RATE=$(( ( $CURRENT_PSWPTOTAL - $PREVIOUS_PSWPTOTAL ) / $PREVIOUS_DATA_AGE )) # Generate perfdata -OUTPUT_PERFDATA="$( printf " pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )" +if [ -n "$CURRENT_OOMKILL" ]; then + OUTPUT_PERFDATA="$( printf " oomkill=%d" "$CURRENT_OOMKILL" )" +fi +OUTPUT_PERFDATA="$( printf "$OUTPUT_PERFDATA pswpin=%d pswpout=%d" "$CURRENT_PSWPIN" "$CURRENT_PSWPOUT" )" # Comparison # note: remember that numbers can be reseted to zero from time to time (64bits counter ?) if [ $PSWP_RATE -gt $THRESHOLD_CRITICAL ]; then echo "CRITICAL swaping rate at $PSWP_RATE (limit at $THRESHOLD_CRITICAL) |$OUTPUT_PERFDATA" exit $STATE_CRITICAL +elif [ -z "$DISABLE_OOMKILL" ] && [ $CURRENT_OOMKILL -gt $THRESHOLD_OOMKILL ]; then + echo "CRITICAL $CURRENT_OOMKILL OOM kill(s) (limit at $THRESHOLD_OOMKILL) |$OUTPUT_PERFDATA" + exit $STATE_CRITICAL elif [ $PSWP_RATE -gt $THRESHOLD_WARNING ]; then echo "WARNING swaping rate at $PSWP_RATE (limit at $THRESHOLD_WARNING) |$OUTPUT_PERFDATA" exit $STATE_WARNING