1
0
Fork 0
scripts-admin-quickndirty-p.../nagios/check_diskstats.sh

169 lines
5.9 KiB
Bash
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/sh
# Little Nagios/Icinga/Naemon probe for Linux disks' I/O via /proc/diskstats
#
# Based on https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
#
# Published under Unlicense <http://unlicense.org/> (similar to public domain)
# Directory holding this script (only useful if utils.sh gets sourced again).
# dirname copes with spaces in $0 and with a $0 that has no directory part,
# in which case it yields "." instead of the script name.
PROGPATH=$( dirname -- "$0" )
REVISION="0.1"
# Stop at the first uncaught error
set -e
# Include check_range()
# Not needed at the moment
#. $PROGPATH/utils.sh
# Nagios plugin exit codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4
# Default values
WARNING_STATS=0
CRITICAL_STATS=0
# Thresholds start empty (= disabled) so that same-named variables inherited
# from the environment cannot silently enable an alert. They are filled in by
# the -i / -I options.
THRESHOLD_IO_IN_PROGRESS_WARNING=""
THRESHOLD_IO_IN_PROGRESS_CRITICAL=""
# Output: exit status, one detail string per severity, and the perfdata buffer
OUTPUT_EXIT_STATUS=$STATE_OK
OUTPUT_DETAIL_OK=""
OUTPUT_DETAIL_WARNING=""
OUTPUT_DETAIL_CRITICAL=""
OUTPUT_PERFDATA=""
#
# Help function
#
# Prints the synopsis, the option list and an example on standard output.
# Takes no argument and does not exit: the caller picks the exit status.
# The here-document is unquoted on purpose so that $0 expands to the name the
# script was invoked with.
usage() {
    cat <<EOF
Usage :
$0 [ -i <nb_in_progress_warn_threshold> ] [ -I <nb_in_progress_crit_threshold> ] -d <device_name> [ ... -d ... ]
This probe send statistics about disks on a Linux system.
At the moment, the "number of IO in progress" (field #9) is the only metric
able to generate an alert.
Options :
-d xx Device name (usually 'sda'). Can use shell pattern like 'sda*' or 'sd?'.
-i N Number of io in progress : warning threshold
-I N Number of io in progress : critical threshold
Example :
This example is a good generic configuration for simple and light systems,
whether physical or virtual, taking any sda, sdb, vda, vdb, etc. disks found
and triggering a warning at 1 IO in progress and an alert above 5.
$0 -i 0 -I 5 -d 'sd?' -d 'vd?'
EOF
}
# Sanity check before anything else: without a readable /proc/diskstats there
# is nothing to measure, so report UNKNOWN right away.
[ -r "/proc/diskstats" ] || {
    printf '%s\n' "UNKNOWN file '/proc/diskstats' not found or not readable."
    exit $STATE_UNKNOWN
}
# TODO: check Linux kernel version. See differences between 2.4+, 2.6+, 4.18+
# https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
#
# Loop on parameters + tests
#
# Options are handled in command-line order: each -d is checked against the
# thresholds seen so far. -i / -I must therefore come BEFORE the -d they apply
# to, which also allows different thresholds for different devices.

# Succeeds when $1 is a decimal integer (an optional leading '-' is accepted).
is_integer() {
    case "${1#-}" in
        '' | *[!0-9]*) return 1 ;;
    esac
    return 0
}

# Appends one counter-type perfdata entry for the current $DEVICE_NAME.
#   $1: metric suffix
#   $2: value; a field that is absent (older kernels) is reported as 0
add_perfdata_counter() {
    OUTPUT_PERFDATA="$( printf "%s\n'%s_%s'=%dc;;;0" "$OUTPUT_PERFDATA" "$DEVICE_NAME" "$1" "${2:-0}" )"
}

# Checks every device whose name matches a shell pattern.
#   $1: shell pattern matched against the device name ('sda', 'sd?', ...)
#   $2: statistics file to read (defaults to /proc/diskstats)
# Sets DEVICE_FOUND to 1 when at least one device matched (0 otherwise) and
# updates OUTPUT_EXIT_STATUS, OUTPUT_DETAIL_WARNING/CRITICAL and
# OUTPUT_PERFDATA. Reads the THRESHOLD_IO_IN_PROGRESS_* variables.
#
# Field layout follows the kernel iostats documentation: 11 base fields, then
# 4 discard fields (kernel 4.18+) and 2 flush fields (kernel 5.5+). The
# discard group starts with "discards completed"; leaving it out would shift
# every following metric by one column.
check_devices() {
    DEVICE_FOUND=0
    while read -r \
        MAJOR MINOR DEVICE_NAME \
        READS_COMPLETED READS_MERGED SECTORS_READ TIME_SPENT_READING \
        WRITES_COMPLETED WRITES_MERGED SECTORS_WRITTEN TIME_SPENT_WRITING \
        IO_IN_PROGRESS \
        TIME_SPENT_IO TIME_SPENT_IO_WEIGHTED \
        DISCARDS_COMPLETED DISCARDS_MERGED SECTORS_DISCARDED TIME_SPENT_DISCARDING \
        FLUSH_REQUESTS TIME_SPENT_FLUSHING \
        FUTURE_THINGIES
    do
        case "$DEVICE_NAME" in
            # Unquoted on purpose: $1 is a shell pattern
            $1)
                DEVICE_FOUND=1
                # Checking the thresholds (strictly greater than)
                if [ -n "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ] && [ "$IO_IN_PROGRESS" -gt "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ]; then
                    OUTPUT_EXIT_STATUS=$STATE_CRITICAL
                    OUTPUT_DETAIL_CRITICAL="$OUTPUT_DETAIL_CRITICAL I/O in progress on $DEVICE_NAME: $IO_IN_PROGRESS"
                fi
                # A warning never downgrades an already critical status
                if [ -n "$THRESHOLD_IO_IN_PROGRESS_WARNING" ] && [ "$OUTPUT_EXIT_STATUS" != "$STATE_CRITICAL" ] && [ "$IO_IN_PROGRESS" -gt "$THRESHOLD_IO_IN_PROGRESS_WARNING" ]; then
                    OUTPUT_EXIT_STATUS=$STATE_WARNING
                    OUTPUT_DETAIL_WARNING="$OUTPUT_DETAIL_WARNING I/O in progress on $DEVICE_NAME: $IO_IN_PROGRESS"
                fi
                # Generating performance data
                # reads
                add_perfdata_counter reads_completed "$READS_COMPLETED"
                add_perfdata_counter reads_merged "$READS_MERGED"
                add_perfdata_counter sectors_read "$SECTORS_READ"
                add_perfdata_counter time_spent_reading "$TIME_SPENT_READING"
                # writes
                add_perfdata_counter writes_completed "$WRITES_COMPLETED"
                add_perfdata_counter writes_merged "$WRITES_MERGED"
                add_perfdata_counter sectors_written "$SECTORS_WRITTEN"
                add_perfdata_counter time_spent_writing "$TIME_SPENT_WRITING"
                # I/O (in-progress is a gauge and carries the thresholds)
                OUTPUT_PERFDATA="$( printf "%s\n'%s_io_in_progress'=%d;%s;%s;0" "$OUTPUT_PERFDATA" "$DEVICE_NAME" "$IO_IN_PROGRESS" "$THRESHOLD_IO_IN_PROGRESS_WARNING" "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" )"
                add_perfdata_counter time_spent_io "$TIME_SPENT_IO"
                add_perfdata_counter time_spent_io_weighted "$TIME_SPENT_IO_WEIGHTED"
                # Discards
                add_perfdata_counter discards_completed "$DISCARDS_COMPLETED"
                add_perfdata_counter discards_merged "$DISCARDS_MERGED"
                add_perfdata_counter sectors_discarded "$SECTORS_DISCARDED"
                add_perfdata_counter time_spent_discarding "$TIME_SPENT_DISCARDING"
                # Flushes
                add_perfdata_counter flush_requests "$FLUSH_REQUESTS"
                add_perfdata_counter time_spent_flushing "$TIME_SPENT_FLUSHING"
                ;;
        esac
    done < "${2:-/proc/diskstats}"
    return 0
}

while getopts hd:i:I: OPT; do
    case "$OPT" in
        'd')
            # Called as a plain command (not in a || list) so that 'set -e'
            # stays effective inside the function.
            check_devices "$OPTARG"
            # Check if the device requested has been found.
            if [ "$DEVICE_FOUND" -lt 1 ]; then
                echo "UNKNOWN device '$OPTARG' not found."
                exit "$STATE_UNKNOWN"
            fi
            ;;
        'h')
            usage
            exit 0
            ;;
        'i')
            # A non-numeric threshold would make every later comparison fail
            # and silently disable the alert: refuse it up front.
            if ! is_integer "$OPTARG"; then
                echo "UNKNOWN warning threshold '$OPTARG' is not an integer."
                exit "$STATE_UNKNOWN"
            fi
            THRESHOLD_IO_IN_PROGRESS_WARNING="$OPTARG"
            ;;
        'I')
            if ! is_integer "$OPTARG"; then
                echo "UNKNOWN critical threshold '$OPTARG' is not an integer."
                exit "$STATE_UNKNOWN"
            fi
            THRESHOLD_IO_IN_PROGRESS_CRITICAL="$OPTARG"
            ;;
        \?)
            # Invalid command line: UNKNOWN for the monitoring system
            # (exit status 1 would be read as WARNING).
            usage
            exit "$STATE_UNKNOWN"
            ;;
    esac
done
# Every matched device appends perfdata and an unmatched -d has already exited,
# so an empty buffer here means that no -d option was given at all. Reporting
# OK while monitoring nothing would hide a configuration mistake.
if [ -z "$OUTPUT_PERFDATA" ]; then
    echo "UNKNOWN no device requested, use -d <device_name>."
    exit "$STATE_UNKNOWN"
fi
# Final output: status keyword followed by the detail text of that severity
case "$OUTPUT_EXIT_STATUS" in
    '0')
        printf "OK%s" "$OUTPUT_DETAIL_OK"
        ;;
    '1')
        printf "WARNING%s" "$OUTPUT_DETAIL_WARNING"
        ;;
    '2')
        printf "CRITICAL%s" "$OUTPUT_DETAIL_CRITICAL"
        ;;
    *)
        printf "UNKNOWN"
        ;;
esac
# We sort the perfdata because of a weird bug in some icinga/pnp4nagios interactions
# (empty lines are dropped and the entries are joined with spaces on one line)
printf "|%s\n" "$( printf "%s" "$OUTPUT_PERFDATA" | sort | grep -v "^$" | tr "\n" " " )"
exit "$OUTPUT_EXIT_STATUS"