#!/bin/sh

# Little Nagios/Icinga/Naemon probe for Linux disks' I/O via /proc/diskstats
#
# Based on https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
#
# Published under Unlicense (similar to public domain)
#
# Every device matching one of the -d shell patterns gets its counters
# exported as performance data. The "I/Os currently in progress" gauge
# (field #9 of the kernel documentation) is compared to the optional
# -i (warning) and -I (critical) thresholds; an alert is raised when the
# value is strictly greater than the threshold.
#
# Matching NRPE command definition (nagios/etc/30_nrpe-basic.cfg):
#   command[check_diskstats]=/usr/local/share/scripts-admin/nagios/check_diskstats.sh -i 0 -I 5 -d 'sd?' -d 'vd?'
#
# Environment:
#   DISKSTATS_FILE  Statistics file to read (default: /proc/diskstats).
#                   Only meant for testing the probe against a fixture.

# Informational only, not used by the code.
# shellcheck disable=SC2034
REVISION="0.1"

# Stop at the first uncaught error, and refuse to expand unset variables
set -eu

# Nagios exit codes. They are defined here so that the stock utils.sh
# does not need to be sourced.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

DISKSTATS_FILE="${DISKSTATS_FILE:-/proc/diskstats}"

# Thresholds (empty means "no threshold")
THRESHOLD_IO_IN_PROGRESS_WARNING=""
THRESHOLD_IO_IN_PROGRESS_CRITICAL=""

# Output
OUTPUT_EXIT_STATUS=$STATE_OK
OUTPUT_DETAIL_OK=""
OUTPUT_DETAIL_WARNING=""
OUTPUT_DETAIL_CRITICAL=""
OUTPUT_PERFDATA=""

# Bookkeeping
DEVICE_PATTERN_COUNT=0  # number of -d options given
SEEN_DEVICES=""         # space-separated names of devices already handled

#
# Help function
#
usage() {
  cat <<EOF
Usage: $0 [ -h ] [ -i <N> ] [ -I <N> ] -d <device> [ ... -d <device> ... ]

This probe sends statistics about disks on a Linux system.
At the moment, the "number of IO in progress" (field #9) is the only metric
able to generate an alert.

Options :
  -h      Show this help
  -d xx   Device name (usually 'sda'). Can use shell pattern like 'sda*' or 'sd?'.
  -i N    Number of io in progress : warning threshold
  -I N    Number of io in progress : critical threshold

An alert is raised when the number of IO in progress is strictly greater
than the threshold.

Example :
  This example is a good generic configuration for simple and light systems,
  whether physical or virtual, taking any sda, sdb, vda, vdb, etc. disks found
  and triggering a warning as soon as 1 IO is in progress and a critical
  alert above 5.
  $0 -i 0 -I 5 -d 'sd?' -d 'vd?'
EOF
}

#
# Print an UNKNOWN status line and leave with the matching exit code.
#   $* - message
#
unknown() {
  printf 'UNKNOWN %s\n' "$*"
  exit "$STATE_UNKNOWN"
}

#
# Abort with UNKNOWN unless the value is a non-negative integer.
# Without this check a bad threshold would only make the later
# "[ ... -gt ... ]" test fail, silently disabling the alert.
#   $1 - option letter (for the message)
#   $2 - value to check
#
require_uint() {
  case "$2" in
    '' | *[!0-9]*)
      unknown "option -$1 expects a non-negative integer, got '$2'."
      ;;
  esac
}

#
# Append one metric to OUTPUT_PERFDATA (one metric per line, so that the
# list can be sorted before being printed).
#   $1 - label
#   $2 - value; an empty value (field not provided by this kernel) becomes 0
#   $3 - unit of measure ('c' for a counter, empty for a gauge)
#   $4 - warning threshold (optional)
#   $5 - critical threshold (optional)
#
add_perfdata() {
  OUTPUT_PERFDATA="${OUTPUT_PERFDATA}
'$1'=${2:-0}${3:-};${4:-};${5:-};0"
}

#
# Check and export every device of DISKSTATS_FILE whose name matches the
# given shell pattern. A device already handled by a previous pattern is
# skipped, so that its perfdata labels stay unique.
#   $1 - shell pattern
#
check_devices() {
  cd_pattern=$1

  # Field order follows the kernel documentation. Kernels before 4.18 do
  # not provide the discard fields and kernels before 5.5 do not provide
  # the flush fields: read leaves those variables empty.
  while read -r _major _minor dev \
    reads_completed reads_merged sectors_read time_spent_reading \
    writes_completed writes_merged sectors_written time_spent_writing \
    io_in_progress \
    time_spent_io time_spent_io_weighted \
    discards_completed discards_merged sectors_discarded time_spent_discarding \
    flush_requests time_spent_flushing \
    _future
  do
    [ -n "$dev" ] || continue

    # The pattern is deliberately left unquoted: it must act as a glob.
    # shellcheck disable=SC2254
    case "$dev" in
      $cd_pattern) ;;
      *) continue ;;
    esac

    case " $SEEN_DEVICES " in
      *" $dev "*) continue ;;
    esac
    SEEN_DEVICES="$SEEN_DEVICES $dev"

    io_in_progress=${io_in_progress:-0}

    # Checking the thresholds
    if [ -n "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ] \
      && [ "$io_in_progress" -gt "$THRESHOLD_IO_IN_PROGRESS_CRITICAL" ]; then
      OUTPUT_EXIT_STATUS=$STATE_CRITICAL
      OUTPUT_DETAIL_CRITICAL="$OUTPUT_DETAIL_CRITICAL I/O in progress on $dev: $io_in_progress"
    elif [ -n "$THRESHOLD_IO_IN_PROGRESS_WARNING" ] \
      && [ "$io_in_progress" -gt "$THRESHOLD_IO_IN_PROGRESS_WARNING" ]; then
      OUTPUT_DETAIL_WARNING="$OUTPUT_DETAIL_WARNING I/O in progress on $dev: $io_in_progress"
      # Never downgrade a CRITICAL raised by another device
      if [ "$OUTPUT_EXIT_STATUS" -eq "$STATE_OK" ]; then
        OUTPUT_EXIT_STATUS=$STATE_WARNING
      fi
    fi

    # Generating performance data
    # reads
    add_perfdata "${dev}_reads_completed" "$reads_completed" c
    add_perfdata "${dev}_reads_merged" "$reads_merged" c
    add_perfdata "${dev}_sectors_read" "$sectors_read" c
    add_perfdata "${dev}_time_spent_reading" "$time_spent_reading" c
    # writes
    add_perfdata "${dev}_writes_completed" "$writes_completed" c
    add_perfdata "${dev}_writes_merged" "$writes_merged" c
    add_perfdata "${dev}_sectors_written" "$sectors_written" c
    add_perfdata "${dev}_time_spent_writing" "$time_spent_writing" c
    # I/O (the only gauge, and the only metric carrying thresholds)
    add_perfdata "${dev}_io_in_progress" "$io_in_progress" '' \
      "$THRESHOLD_IO_IN_PROGRESS_WARNING" "$THRESHOLD_IO_IN_PROGRESS_CRITICAL"
    add_perfdata "${dev}_time_spent_io" "$time_spent_io" c
    add_perfdata "${dev}_time_spent_io_weighted" "$time_spent_io_weighted" c
    # Discards
    add_perfdata "${dev}_discards_completed" "$discards_completed" c
    add_perfdata "${dev}_discards_merged" "$discards_merged" c
    add_perfdata "${dev}_sectors_discarded" "$sectors_discarded" c
    add_perfdata "${dev}_time_spent_discarding" "$time_spent_discarding" c
    # Flushes
    add_perfdata "${dev}_flush_requests" "$flush_requests" c
    add_perfdata "${dev}_time_spent_flushing" "$time_spent_flushing" c
  done < "$DISKSTATS_FILE"
}

#
# First pass on parameters: help, thresholds and validation.
# The devices are only handled in a second pass, so that -i/-I apply to
# every -d whatever the order of the options on the command line.
# The leading ':' makes getopts silent; errors are reported below.
#
while getopts ':hd:i:I:' OPT; do
  case "$OPT" in
    h)
      usage
      exit "$STATE_UNKNOWN"
      ;;
    d)
      DEVICE_PATTERN_COUNT=$((DEVICE_PATTERN_COUNT + 1))
      ;;
    i)
      require_uint i "$OPTARG"
      THRESHOLD_IO_IN_PROGRESS_WARNING="$OPTARG"
      ;;
    I)
      require_uint I "$OPTARG"
      THRESHOLD_IO_IN_PROGRESS_CRITICAL="$OPTARG"
      ;;
    :)
      printf 'UNKNOWN option -%s requires an argument.\n' "$OPTARG"
      usage
      exit "$STATE_UNKNOWN"
      ;;
    \?)
      printf 'UNKNOWN invalid option -%s.\n' "$OPTARG"
      usage
      exit "$STATE_UNKNOWN"
      ;;
  esac
done

if [ "$DEVICE_PATTERN_COUNT" -eq 0 ]; then
  printf 'UNKNOWN no device given (option -d).\n'
  usage
  exit "$STATE_UNKNOWN"
fi

if [ ! -r "$DISKSTATS_FILE" ]; then
  unknown "file '$DISKSTATS_FILE' not found or not readable."
fi
# TODO: check Linux kernel version. See differences between 2.4+, 2.6+, 4.18+
# https://www.kernel.org/doc/html/latest/admin-guide/iostats.html

#
# Second pass on parameters: devices.
# Setting OPTIND back to 1 restarts getopts from the first argument.
#
OPTIND=1
while getopts ':hd:i:I:' OPT; do
  case "$OPT" in
    d)
      check_devices "$OPTARG"
      ;;
  esac
done

# Reporting OK while nothing was checked would hide a misconfiguration
if [ -z "$SEEN_DEVICES" ]; then
  unknown "no device matching the given pattern(s) found in '$DISKSTATS_FILE'."
fi

# Final output
case "$OUTPUT_EXIT_STATUS" in
  "$STATE_OK")
    printf 'OK%s' "$OUTPUT_DETAIL_OK"
    ;;
  "$STATE_WARNING")
    printf 'WARNING%s' "$OUTPUT_DETAIL_WARNING"
    ;;
  "$STATE_CRITICAL")
    printf 'CRITICAL%s' "$OUTPUT_DETAIL_CRITICAL"
    ;;
  *)
    printf 'UNKNOWN'
    ;;
esac

# We sort the perfdata because of a weird bug in some icinga/pnp4nagios interactions
printf '|%s\n' "$(printf '%s' "$OUTPUT_PERFDATA" | sort | grep -v '^$' | tr '\n' ' ')"
exit "$OUTPUT_EXIT_STATUS"