Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions cmk/plugins/hp_proliant/agent_based/hp_proliant_da_cntlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# conditions defined in the file COPYING, which is part of this source code package.
from collections.abc import Mapping
from enum import StrEnum
from typing import assert_never, NamedTuple, Self
from typing import assert_never, NamedTuple, Self, TypedDict

from cmk.agent_based.v2 import (
CheckPlugin,
Expand Down Expand Up @@ -182,16 +182,45 @@ def from_line(cls, line: list[str]) -> Self | None:
)


class Params(TypedDict):
    """Check parameters: the monitoring state to use for each "other" value.

    Every value is the integer form of a monitoring state; the check function
    converts it back with ``State(...)``.
    """

    # State used when the controller condition is reported as "other".
    condition_other_state: int
    # State used when the board condition is reported as "other".
    board_condition_other_state: int
    # State used when the board status is reported as "other".
    board_status_other_state: int


# Every "other" value defaults to WARN, which is what the check reported before
# it became configurable. HPE ProLiant Gen11 / iLO 6 firmware tends to report
# the board condition as "other" even for healthy controllers, leaving the
# service in WARN permanently; the check ruleset lets users override the state
# for each of the three fields independently.
DEFAULT_PARAMETERS: Params = Params(
    condition_other_state=State.WARN.value,
    board_condition_other_state=State.WARN.value,
    board_status_other_state=State.WARN.value,
)


def _monitoring_state(value: SNMPCondition | SNMPState, other_state: State) -> State:
    """Translate one SNMP condition/status value into a monitoring state.

    Args:
        value: The condition or status reported for the controller.
        other_state: The state to report when ``value`` is the "other" member.

    Returns:
        ``other_state`` if ``value`` is "other", otherwise ``value.to_state()``.
    """
    if value in (SNMPCondition.OTHER, SNMPState.OTHER):
        # "other" is the only value whose state is user-configurable.
        return other_state
    return value.to_state()


def parse_hp_proliant_da_cntlr(string_table: StringTable) -> ParsedSection:
    """Build the parsed section, keyed by the first column of every SNMP row.

    The value is whatever ``ControllerData.from_line`` returns for the row,
    which may be ``None`` when the row cannot be interpreted.
    """
    return {line[0]: ControllerData.from_line(line) for line in string_table}


def discovery_hp_proliant_da_cntlr(section: ParsedSection) -> DiscoveryResult:
    """Yield one service per controller row that parsed to real data."""
    # Skip phantom/placeholder rows (parsed to ``None``): HPE ProLiant Gen11 /
    # iLO 6 exposes a controller table row -- typically at index 0 -- whose
    # condition / role / board-status / board-condition cells are all "0", a
    # value the vendor MIB does not define. Discovering it created a service
    # that was permanently UNKNOWN ("Controller not found in SNMP data").
    # An empty section simply yields nothing, so no separate guard is needed.
    yield from (Service(item=item) for item, data in section.items() if data is not None)


def check_hp_proliant_da_cntlr(item: ControllerID, section: ParsedSection) -> CheckResult:
def check_hp_proliant_da_cntlr(
item: ControllerID, params: Params, section: ParsedSection
) -> CheckResult:
if not (subsection := section.get(item)):
yield Result(state=State.UNKNOWN, summary="Controller not found in SNMP data")
return
Expand All @@ -201,9 +230,16 @@ def check_hp_proliant_da_cntlr(item: ControllerID, section: ParsedSection) -> Ch
"Board-Condition": subsection.b_cond,
"Board-Status": subsection.b_status,
}
other_states: Mapping[str, State] = {
"Condition": State(params["condition_other_state"]),
"Board-Condition": State(params["board_condition_other_state"]),
"Board-Status": State(params["board_status_other_state"]),
}

yield Result(
state=State.worst(*(state.to_state() for state in states.values())),
state=State.worst(
*(_monitoring_state(state, other_states[label]) for label, state in states.items())
),
summary=(
f"{', '.join(f'{label}: {state}' for label, state in states.items())} "
f"(Role: {subsection.role}, Model: {subsection.model}, Slot: {subsection.slot}, "
Expand Down Expand Up @@ -231,4 +267,6 @@ def check_hp_proliant_da_cntlr(item: ControllerID, section: ParsedSection) -> Ch
service_name="HW Controller %s",
discovery_function=discovery_hp_proliant_da_cntlr,
check_function=check_hp_proliant_da_cntlr,
check_default_parameters=DEFAULT_PARAMETERS,
check_ruleset_name="hp_proliant_da_cntlr",
)
10 changes: 9 additions & 1 deletion cmk/plugins/hp_proliant/checkman/hp_proliant_da_cntlr
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@ description:
This check has been developed with ProLiant DL360 G5 systems but should work
on other HPE ProLiant servers too.

The reported controller condition, board condition and board status can each
take the value {other}, which by default maps to {WARN}. On ProLiant Gen11 /
iLO 6 the board condition is reported as {other} for perfectly healthy
controllers; the check parameters allow remapping the state for the {other}
value of each field independently.

item:
Index of the controller

discovery:
One service is created for each raid controller.
One service is created for each RAID controller. Placeholder rows that report
no valid controller data (all-zero cells, as seen on Gen11 / iLO 6) are
skipped.
59 changes: 59 additions & 0 deletions cmk/plugins/hp_proliant/rulesets/hp_proliant_da_cntlr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# Copyright (C) 2026 Checkmk GmbH - License: GNU General Public License v2
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
# conditions defined in the file COPYING, which is part of this source code package.

from cmk.rulesets.v1 import Help, Title
from cmk.rulesets.v1.form_specs import (
DefaultValue,
DictElement,
Dictionary,
ServiceState,
)
from cmk.rulesets.v1.rule_specs import CheckParameters, HostAndItemCondition, Topic


def _other_state_element(title: Title) -> DictElement:
    """Build one optional state selector, pre-filled with WARN, under ``title``."""
    return DictElement(
        required=False,
        parameter_form=ServiceState(
            title=title,
            prefill=DefaultValue(ServiceState.WARN),
        ),
    )


def _make_form() -> Dictionary:
    """Return the rule form: one optional state selector per "other" field.

    The dictionary keys are the parameter names the check function reads.
    """
    return Dictionary(
        help_text=Help(
            "The RAID controllers of HPE ProLiant servers report a condition, a "
            "board condition and a board status. Each of these can take the value "
            "<i>other</i>, meaning the instrument agent does not recognize the "
            "status. HPE ProLiant Gen11 / iLO 6 firmware reports the board "
            "condition as <i>other</i> for perfectly healthy controllers, which "
            "makes the service WARN permanently. Here you can remap the monitoring "
            "state used for the <i>other</i> value of each field independently."
        ),
        elements={
            # Titles stay literal ``Title(...)`` calls rather than being built
            # from a format string.
            "condition_other_state": _other_state_element(
                Title("State when the controller condition is <i>other</i>")
            ),
            "board_condition_other_state": _other_state_element(
                Title("State when the board condition is <i>other</i>")
            ),
            "board_status_other_state": _other_state_element(
                Title("State when the board status is <i>other</i>")
            ),
        },
    )


# Check-parameter rule for the RAID controller check. ``name`` is the ruleset
# identifier; the rule can be restricted by host and by controller index.
rule_spec_hp_proliant_da_cntlr = CheckParameters(
    name="hp_proliant_da_cntlr",
    title=Title("HPE ProLiant RAID controller"),
    topic=Topic.STORAGE,
    parameter_form=_make_form,
    condition=HostAndItemCondition(item_title=Title("Controller index")),
)
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from cmk.plugins.hp_proliant.agent_based.hp_proliant_da_cntlr import (
check_hp_proliant_da_cntlr,
ControllerID,
DEFAULT_PARAMETERS,
discovery_hp_proliant_da_cntlr,
Params,
parse_hp_proliant_da_cntlr,
)

Expand All @@ -21,13 +23,14 @@


def test_discovery() -> None:
    """Discovery must yield only the controllers that carry real data."""
    # The all-zero placeholder row ("9") is a phantom controller and must not be
    # discovered, otherwise its service is permanently UNKNOWN.
    assert list(
        discovery_hp_proliant_da_cntlr(section=parse_hp_proliant_da_cntlr(STRING_TABLE))
    ) == [
        Service(item="0"),
        Service(item="3"),
        Service(item="6"),
    ]


Expand Down Expand Up @@ -73,7 +76,34 @@ def test_discovery() -> None:
def test_check(item: ControllerID, expected: list[Result]) -> None:
assert (
list(
check_hp_proliant_da_cntlr(item=item, section=parse_hp_proliant_da_cntlr(STRING_TABLE))
check_hp_proliant_da_cntlr(
item=item,
params=DEFAULT_PARAMETERS,
section=parse_hp_proliant_da_cntlr(STRING_TABLE),
)
)
== expected
)


def test_check_remap_other_state() -> None:
    """Remapping the "other" board condition to OK must yield an OK result."""
    # Controller "6" reports Board-Condition = other (WARN by default); on
    # Gen11 / iLO 6 this is normal for healthy controllers. Remapping the
    # "other" board condition to OK clears the false WARN.
    params: Params = {
        **DEFAULT_PARAMETERS,
        "board_condition_other_state": State.OK.value,
    }
    assert list(
        check_hp_proliant_da_cntlr(
            item="6",
            params=params,
            section=parse_hp_proliant_da_cntlr(STRING_TABLE),
        )
    ) == [
        Result(
            state=State.OK,
            summary="Condition: ok, Board-Condition: other, Board-Status: enabled (Role: other, Model: 1, Slot: 6, Serial: PEYHN0ARCC307J)",
            details="The instrument agent does not recognize the status of the controller. You may need to upgrade the instrument agent.",
        )
    ]
Loading