#!/bin/sh
# Little Nagios/Icinga/Naemon probe for Linux disks' I/O via /proc/diskstats
#
# Based on https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
#
# Published under Unlicense (similar to public domain)

# Directory part of the path this script was invoked with (only needed if
# utils.sh gets sourced again, see below).
PROGPATH=$( printf '%s\n' "$0" | sed -e 's,[\\/][^\\/][^\\/]*$,,' )
REVISION="0.1"

# Stop at the first uncaught error
set -e

# Include check_range()
# Not needed at the moment
#. $PROGPATH/utils.sh

# Plugin exit codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Default values
WARNING_STATS=0
CRITICAL_STATS=0

# Output: worst state seen so far, one detail string per state, and the
# accumulated performance data (one entry per line, joined at the very end).
OUTPUT_EXIT_STATUS=$STATE_OK
OUTPUT_DETAIL_OK=""
OUTPUT_DETAIL_WARNING=""
OUTPUT_DETAIL_CRITICAL=""
OUTPUT_PERFDATA=""

#
# Help function
#
# Prints the usage text on stdout. The here-document delimiter is unquoted on
# purpose so that $0 expands to the name the probe was called with.
usage() {
    cat <<EOF
Usage : $0 [ -i <warning> ] [ -I <critical> ] -d <device> [ ... -d <device> ... ]

This probe send statistics about disks on a Linux system.
At the moment, the "number of IO in progress" (field #9) is the only metric
able to generate an alert.

Options :
  -d xx   Device name (usually 'sda'). Can use shell pattern like 'sda*' or 'sd?'.
  -i N    Number of io in progress : warning threshold
  -I N    Number of io in progress : critical threshold

Notes :
  An alert is raised when the value is strictly greater than N.
  Thresholds only apply to the -d options that follow them on the command
  line, so give -i / -I before -d.

Example :
  This example is a good generic configuration for simple and light systems,
  whether physical or virtual, taking any sda, sdb, vda, vdb, etc. disks found
  and triggering a warning at 1 IO in progress and an alert at 5.

    $0 -i 0 -I 4 -d 'sd?' -d 'vd?'
EOF
}

# Some early checks
if [ ! -r "/proc/diskstats" ]; then
    echo "UNKNOWN file '/proc/diskstats' not found or not readable."
    exit $STATE_UNKNOWN
fi

# TODO: check Linux kernel version.
# See differences between 2.4+, 2.6+, 4.18+
# https://www.kernel.org/doc/html/latest/admin-guide/iostats.html

# is_integer VALUE
# Succeeds when VALUE is an optionally negative decimal integer, i.e. something
# the `[ ... -gt ... ]` threshold tests below can compare without erroring out.
is_integer() {
    case "${1#-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# check_threshold OPTION VALUE
# Exits with an UNKNOWN state when VALUE is neither empty nor an integer.
# An empty VALUE is accepted and means "no threshold" (the tests in
# scan_diskstats are guarded with -n), which keeps templated command lines
# with empty macros working.
check_threshold() {
    if [ -n "$2" ] && ! is_integer "$2"; then
        echo "UNKNOWN threshold '$2' given to -$1 is not an integer."
        exit $STATE_UNKNOWN
    fi
}

# scan_diskstats PATTERN [FILE]
# Reads FILE (default: /proc/diskstats) and, for every device whose name
# matches the shell pattern PATTERN:
#   - compares the number of I/Os in progress with the thresholds currently
#     held in THRESHOLD_IO_IN_PROGRESS_WARNING / _CRITICAL and updates
#     OUTPUT_EXIT_STATUS and OUTPUT_DETAIL_WARNING / _CRITICAL;
#   - appends one perfdata entry per statistic to OUTPUT_PERFDATA.
# Sets DEVICE_FOUND to 1 when at least one device matched, 0 otherwise.
# Works on globals only (POSIX sh has no local variables) and always returns 0;
# the caller checks DEVICE_FOUND.
scan_diskstats() {
    DISKSTATS_PATTERN="$1"
    DISKSTATS_FILE="${2:-/proc/diskstats}"
    DEVICE_FOUND=0

    # Column order follows the kernel iostats documentation. The discard
    # group has FOUR columns (completed, merged, sectors, milliseconds);
    # leaving "completed" out shifts every later value by one column.
    # Columns a given kernel does not provide are left empty by read.
    while read -r \
        MAJOR MINOR DEVICE_NAME \
        READS_COMPLETED READS_MERGED SECTORS_READ TIME_SPENT_READING \
        WRITES_COMPLETED WRITES_MERGED SECTORS_WRITTEN TIME_SPENT_WRITING \
        IO_IN_PROGRESS \
        TIME_SPENT_IO TIME_SPENT_IO_WEIGHTED \
        DISCARDS_COMPLETED DISCARDS_MERGED SECTORS_DISCARDED TIME_SPENT_DISCARDING \
        FLUSH_REQUESTS TIME_SPENT_FLUSHING \
        FUTURE_THINGIES
    do
        case "$DEVICE_NAME" in
            # Deliberately unquoted: the argument is a shell pattern.
            $DISKSTATS_PATTERN)
                DEVICE_FOUND=1

                # Checking the thresholds
                if [ -n "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ] &&
                   [ "$IO_IN_PROGRESS" -gt "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ]; then
                    OUTPUT_EXIT_STATUS=$STATE_CRITICAL
                    OUTPUT_DETAIL_CRITICAL="$OUTPUT_DETAIL_CRITICAL I/O in progress on $DEVICE_NAME: $IO_IN_PROGRESS"
                fi
                # A warning never downgrades an already critical result.
                if [ -n "$THRESHOLD_IO_IN_PROGRESS_WARNING" ] &&
                   [ "$OUTPUT_EXIT_STATUS" != "$STATE_CRITICAL" ] &&
                   [ "$IO_IN_PROGRESS" -gt "$THRESHOLD_IO_IN_PROGRESS_WARNING" ]; then
                    OUTPUT_EXIT_STATUS=$STATE_WARNING
                    OUTPUT_DETAIL_WARNING="$OUTPUT_DETAIL_WARNING I/O in progress on $DEVICE_NAME: $IO_IN_PROGRESS"
                fi

                # Generating performance data.
                # One printf call for the whole device: the format is reused
                # for every group of six arguments
                #   device, metric, value, unit, warning, critical
                # Counters use the "c" unit and carry no thresholds; the
                # io_in_progress gauge has no unit and carries both.
                # Empty values (columns missing on this kernel) go through
                # %d and are typically rendered as 0.
                OUTPUT_PERFDATA="$OUTPUT_PERFDATA$(
                    printf "\n'%s_%s'=%d%s;%s;%s;0" \
                        "$DEVICE_NAME" reads_completed       "$READS_COMPLETED"        c "" "" \
                        "$DEVICE_NAME" reads_merged          "$READS_MERGED"           c "" "" \
                        "$DEVICE_NAME" sectors_read          "$SECTORS_READ"           c "" "" \
                        "$DEVICE_NAME" time_spent_reading    "$TIME_SPENT_READING"     c "" "" \
                        "$DEVICE_NAME" writes_completed      "$WRITES_COMPLETED"       c "" "" \
                        "$DEVICE_NAME" writes_merged         "$WRITES_MERGED"          c "" "" \
                        "$DEVICE_NAME" sectors_written       "$SECTORS_WRITTEN"        c "" "" \
                        "$DEVICE_NAME" time_spent_writing    "$TIME_SPENT_WRITING"     c "" "" \
                        "$DEVICE_NAME" io_in_progress        "$IO_IN_PROGRESS"         "" \
                            "$THRESHOLD_IO_IN_PROGRESS_WARNING" "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" \
                        "$DEVICE_NAME" time_spent_io         "$TIME_SPENT_IO"          c "" "" \
                        "$DEVICE_NAME" time_spent_io_weighted "$TIME_SPENT_IO_WEIGHTED" c "" "" \
                        "$DEVICE_NAME" discards_completed    "$DISCARDS_COMPLETED"     c "" "" \
                        "$DEVICE_NAME" discards_merged       "$DISCARDS_MERGED"        c "" "" \
                        "$DEVICE_NAME" sectors_discarded     "$SECTORS_DISCARDED"      c "" "" \
                        "$DEVICE_NAME" time_spent_discarding "$TIME_SPENT_DISCARDING"  c "" "" \
                        "$DEVICE_NAME" flush_requests        "$FLUSH_REQUESTS"         c "" "" \
                        "$DEVICE_NAME" time_spent_flushing   "$TIME_SPENT_FLUSHING"    c "" ""
                )"
                ;;
        esac
    done < "$DISKSTATS_FILE"

    return 0
}

#
# Loop on parameters + tests
#
# Options are handled in command-line order: each -d is evaluated immediately
# with whatever thresholds have been set by the -i / -I seen before it.
while getopts hd:i:I: OPT; do
    case "$OPT" in
        'd')
            # Called outside of any condition so that "set -e" stays
            # effective inside the function.
            scan_diskstats "$OPTARG"

            # Check if the device requested has been found.
            if [ "$DEVICE_FOUND" -lt 1 ]; then
                echo "UNKNOWN device '$OPTARG' not found."
                exit $STATE_UNKNOWN
            fi
            ;;
        'h')
            usage
            exit 0
            ;;
        'i')
            check_threshold "$OPT" "$OPTARG"
            THRESHOLD_IO_IN_PROGRESS_WARNING="$OPTARG"
            ;;
        'I')
            check_threshold "$OPT" "$OPTARG"
            THRESHOLD_IO_IN_PROGRESS_CRITICAL="$OPTARG"
            ;;
        \?)
            usage
            exit 1
            ;;
    esac
done

# Should we check if no device has been requested ? Overkill ?
# Final output
# First the state label followed by the detail string collected for that
# state, without a trailing newline so the perfdata stays on the same line.
case "$OUTPUT_EXIT_STATUS" in
    '0')
        printf "OK%s" "$OUTPUT_DETAIL_OK"
        ;;
    '1')
        printf "WARNING%s" "$OUTPUT_DETAIL_WARNING"
        ;;
    '2')
        printf "CRITICAL%s" "$OUTPUT_DETAIL_CRITICAL"
        ;;
    *)
        printf "UNKNOWN"
        # Unexpected or empty state: make the exit code agree with the
        # label that was just printed (3 is STATE_UNKNOWN).
        OUTPUT_EXIT_STATUS=3
        ;;
esac

# We sort the perfdata because of a weird bug in some icinga/pnp4nagios interactions
# The entries are stored one per line; empty lines are dropped and the
# remaining ones are joined with spaces after the "|" separator.
printf "|%s\n" "$( printf "%s" "$OUTPUT_PERFDATA" | sort | grep -v "^$" | tr "\n" " " )"

exit "$OUTPUT_EXIT_STATUS"