[tech] [committee] Temperature Monitoring in Server Room [repost]
Andrew Williams
andrew at ucc.gu.uwa.edu.au
Tue Mar 19 18:16:35 AWST 2019
On 2019-03-19 5:08 PM, Melissa Star wrote:
> So how difficult is it for icinga to call external code in response to events?
As an example, here's some of our icinga config for a single host
(apc54), which is the smart PDU in rack 54:
# The host definition:
object Host "apc54" {
  # Inherit the shared PDU host settings from the "mwa-apc" template.
  # The import stays first so the assignments below override it.
  import "mwa-apc"

  # NOTE(review): presumably names the switch this PDU is reached
  # through (e.g. for dependency rules) - confirm against the rest
  # of the configuration.
  vars.parent_switch = "dell-mc-access04"

  # Management address that checks for this host are run against.
  address = "10.128.0.154"
}
# Service definitions that apply to that host (and the other PDUs):
apply Service "pdu_temp_monitored" {
  # Rack temperature check, applied to every host in the "apcees" group.
  import "mwa-service"

  assign where "apcees" in host.groups

  display_name = "Rack Temperature"

  # What to run: the check itself, plus the event handler that is
  # invoked when the service changes state.
  check_command = "check_pdu_temp"
  event_command = "event_pdutemp"

  # Scheduling: check every 60s normally, re-check every 30s while a
  # problem is still unconfirmed, and treat it as confirmed (HARD)
  # after 3 attempts.
  max_check_attempts = 3
  check_interval = 60s
  retry_interval = 30s
}
apply Service "pdu_currents" {
  # Per-plug current check, applied to every host in the "apcees" group.
  import "mwa-service"

  assign where "apcees" in host.groups

  display_name = "PDU plug currents"
  check_command = "check_pdu_currents"

  # Scheduling: check every 600s normally, re-check every 120s while a
  # problem is still unconfirmed, and treat it as confirmed (HARD)
  # after 3 attempts. No event_command is set for this service.
  max_check_attempts = 3
  check_interval = 600s
  retry_interval = 120s
}
# Command definitions, as called in the above service definitions.
# These commands return a couple of lines of data in the Nagios API
# format, with a return code of 0 (OK), 1 (WARNING), 2 (CRITICAL)
# or 3 (UNKNOWN/UNREACHABLE), plus a human readable message and the
# measurement value/s. You can use existing plugins, or write your own.
# This command is one that I wrote, to extract values from the PDU using
# SNMP, and write Nagios API format check results to stdout.
object CheckCommand "check_pdu_temp" {
  # Check command used by the "pdu_temp_monitored" service.
  import "plugin-check-command"

  # Runs the locally written "zeus" plugin.
  # NOTE(review): "-n" presumably selects the temperature reading -
  # confirm against the zeus script itself.
  command = [ "/usr/local/bin/zeus", "-n" ]

  # Appends "-H <host name>" so the plugin knows which host to query.
  arguments = {
    "-H" = "$host.name$"
  }
}
object CheckCommand "check_pdu_currents" {
  # Check command used by the "pdu_currents" service.
  import "plugin-check-command"

  # Runs the locally written "zeus" plugin.
  # NOTE(review): "-p" presumably selects the per-plug current
  # readings - confirm against the zeus script itself.
  command = [ "/usr/local/bin/zeus", "-p" ]

  # Appends "-H <host name>" so the plugin knows which host to query.
  arguments = {
    "-H" = "$host.name$"
  }
}
# The event command, referenced by the pdu_temp_monitored service. This is an
# arbitrary executable, and this command definition passes in the
# state (eg CRITICAL or OK), the state_type (SOFT or HARD) and
# the number of times the plugin has been called and returned that state
# since the last time it was 'OK'. It is called every time the service
# changes state, so it's up to the command to interpret the values
# passed and act accordingly.
# For example, an event command might do nothing if the state changes
# from something else to 'OK', restart the web server if it's
# the first check that returned CRITICAL, or reboot the server if it's
# the third. (I don't recommend actually doing that, BTW).
object EventCommand "event_pdutemp" {
  # Event handler referenced by the "pdu_temp_monitored" service via
  # its event_command attribute.
  import "plugin-event-command"

  # The command line must sit in ONE double-quoted literal: a raw line
  # break inside "..." is not a valid Icinga 2 string, so the wrapped
  # form would fail to parse. Arguments passed to the script, in order:
  #   1. host name
  #   2. service state (e.g. OK, CRITICAL)
  #   3. state type (SOFT or HARD)
  #   4. current check attempt number
  command = "/usr/local/bin/event_pdutemp.py $host.name$ $service.state$ $service.state_type$ $service.check_attempt$"
}
Andrew
More information about the tech
mailing list