[tech] [committee] Temperature Monitoring in Server Room [repost]

Andrew Williams andrew at ucc.gu.uwa.edu.au
Tue Mar 19 18:16:35 AWST 2019


On 2019-03-19 5:08 PM, Melissa Star wrote:

> So how difficult is it for icinga to call external code in response to events?

As an example, here's some of our icinga config for a single host 
(apc54), which is the smart PDU in rack 54:

# The host definition:
object Host "apc54" {
    # Inherit from the "mwa-apc" template (defined elsewhere). The import
    # stays first so the host-specific values below are applied after it.
    import "mwa-apc"

    # Custom variable naming the switch this host hangs off. How it is
    # consumed is not shown here -- presumably a dependency rule; confirm.
    vars.parent_switch = "dell-mc-access04"

    # Address of this host.
    address = "10.128.0.154"
}


# Service definitions that apply to that host (and the other PDUs):
apply Service "pdu_temp_monitored" {
    # Inherit from the "mwa-service" template (defined elsewhere).
    import "mwa-service"

    # Create this service on every host that is in the "apcees" host group.
    assign where "apcees" in host.groups

    display_name = "Rack Temperature"

    # Scheduling: check every 60s; after a non-OK result re-check every
    # 30s, with the state becoming HARD on the 3rd attempt.
    max_check_attempts = 3
    retry_interval = 30s
    check_interval = 60s

    # The check that produces the result, and the event handler that is
    # run in response to it.
    check_command = "check_pdu_temp"
    event_command = "event_pdutemp"
}

apply Service "pdu_currents" {
    # Inherit from the "mwa-service" template (defined elsewhere).
    import "mwa-service"

    # Create this service on every host that is in the "apcees" host group.
    assign where "apcees" in host.groups

    display_name = "PDU plug currents"

    # Scheduling: check every 600s; after a non-OK result re-check every
    # 120s, with the state becoming HARD on the 3rd attempt.
    max_check_attempts = 3
    retry_interval = 120s
    check_interval = 600s

    # No event_command is set here: this service only runs its check.
    check_command = "check_pdu_currents"
}


# Command definitions, as called in the above service definitions.
# These commands return a couple of lines of data in the Nagios API
# format, with a return code of 0 (OK), 1 (WARNING), 2 (CRITICAL)
# or 3 (UNKNOWN/UNREACHABLE), plus a human readable message and the
# measurement value/s. You can use existing plugins, or write your own.
# This command is one that I wrote, to extract values from the PDU using
# SNMP, and write Nagios API format check results to stdout.

object CheckCommand "check_pdu_temp" {
    import "plugin-check-command"

    # Plugin executable plus its fixed "-n" flag (presumably selects the
    # temperature reading -- confirm against the zeus plugin itself).
    command = [
        "/usr/local/bin/zeus",
        "-n"
    ]

    # Appended to the command line as: -H <name of the host being checked>.
    # Expanded form of the shorthand  "-H" = "$host.name$".
    arguments = {
        "-H" = {
            value = "$host.name$"
        }
    }
}

object CheckCommand "check_pdu_currents" {
    import "plugin-check-command"

    # Plugin executable plus its fixed "-p" flag (presumably selects the
    # per-plug current readings -- confirm against the zeus plugin itself).
    command = [
        "/usr/local/bin/zeus",
        "-p"
    ]

    # Appended to the command line as: -H <name of the host being checked>.
    # Expanded form of the shorthand  "-H" = "$host.name$".
    arguments = {
        "-H" = {
            value = "$host.name$"
        }
    }
}


# The event command, referenced by the pdu_temp_monitored service. This is an
# arbitrary executable, and this command definition passes in the
# state (eg CRITICAL or OK), the state_type (SOFT or HARD) and
# the number of times the plugin has been called and returned that state
# since the last time it was 'OK'. It is called every time the service
# changes state, so it's up to the command to interpret the values
# passed and act accordingly.
# For example, an event command might do nothing if the state changes
# from something else to 'OK', restart the web server if it's
# the first check that returned CRITICAL, or reboot the server if it's
# the third. (I don't recommend actually doing that, BTW).

# Event handler for the rack temperature service. Runs an external script
# and hands it, in order: the host name, the service state (e.g. CRITICAL
# or OK), the state type (SOFT or HARD) and the current check attempt
# number. The script decides what, if anything, to do with those values.
object EventCommand "event_pdutemp" {
    import "plugin-event-command"

    # Written as an array so that each macro is passed as its own argument
    # and no shell is involved. The original single string had been
    # line-wrapped in the middle of the literal, which put a newline
    # between the host name and the remaining arguments.
    command = [
        "/usr/local/bin/event_pdutemp.py",
        "$host.name$",
        "$service.state$",
        "$service.state_type$",
        "$service.check_attempt$"
    ]
}

Andrew


More information about the tech mailing list