Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 81 additions & 3 deletions osism/tasks/conductor/sonic/config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
# VXLAN VTEP name used for VXLAN tunnel configuration
VXLAN_VTEP_NAME = "vtepServ"

# Default listen port of the SONiC telemetry/gNMI container, used for the
# GNMI_ONLY control-plane ACL rule when TELEMETRY|gnmi does not set a port.
DEFAULT_GNMI_PORT = "8080"

# Top-level scaffold keys that the orchestrator and downstream helpers index
# into directly. Kept as a single source of truth so that the production code
# and the test helpers cannot drift apart when keys are added or removed.
Expand Down Expand Up @@ -105,11 +109,11 @@
# difference being only that the generator depends on this one as an input.
# Distinct from inherited tables, which the generator also writes selected
# fields into; the distinction is read-only vs. read-and-update, not the access
# syntax (inherited tables are read via config.get() too). Empty for now; the
# first consumer is the gNMI listen port, read from TELEMETRY. Must stay
# syntax (inherited tables are read via config.get() too). The first (and so
# far only) consumer is the gNMI listen port, read from TELEMETRY. Must stay
# disjoint from OWNED_TABLE_KEYS -- an image-consumed table dropped up front
# would always read back empty.
IMAGE_CONSUMED_TABLE_KEYS = ()
IMAGE_CONSUMED_TABLE_KEYS = ("TELEMETRY",)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change makes the ownership docstring stale, and any replacement that names the members will rot the same way.

Adding TELEMETRY here means the image-consumed tier is no longer empty, but the generate_sonic_config ownership docstring (~line 203) still reads "Empty for now." — now contradicting the code. Rather than swap in "Currently TELEMETRY" (which goes stale the next time a table joins the tier), the docstring should describe the category and leave the membership to this constant, so there's one source of truth: drop "Empty for now." and, if anything, point at IMAGE_CONSUMED_TABLE_KEYS for the current members. (The constant's own comment above has the same problem — "and so far only" is a count that rots — but that's a separate cleanup.)


# Owned tables that are also scaffolded: every scaffold key except the
# inherited ones. The orchestrator setdefault-creates these up front, so
Expand All @@ -126,6 +130,8 @@
# defaults (location "Data Center", contact "info@example.com") even when the
# device has no SNMP data in NetBox.
ON_DEMAND_OWNED_TABLE_KEYS = (
"ACL_RULE",
"ACL_TABLE",
"ROUTE_REDISTRIBUTE",
"SNMP_SERVER",
"SNMP_AGENT_ADDRESS_CONFIG",
Expand Down Expand Up @@ -424,6 +430,8 @@ def generate_sonic_config(device, hwsku, device_as_mapping=None, config_version=
config["MGMT_INTERFACE"][f"eth0|{oob_ip}/{prefix_len}"] = {}
metalbox_ip = _get_metalbox_ip_for_device(device)
config["STATIC_ROUTE"]["mgmt|0.0.0.0/0"] = {"nexthop": metalbox_ip}
# Restrict control-plane services (SNMP, gNMI) to the OOB network
_add_ctrlplane_acls(config, oob_ip, prefix_len)
else:
oob_ip = None

Expand Down Expand Up @@ -2326,3 +2334,73 @@ def _add_snmp_configuration(config, device, oob_ip):
counter += 1

logger.debug(f"Added snmp_server_target {host}")


def _get_gnmi_port(config):
"""Return the gNMI server port for the GNMI_ONLY control-plane ACL rule.

The telemetry/gNMI container reads its listen port from
TELEMETRY|gnmi|port and falls back to 8080 when unset, so the ACL rule
follows the same lookup against the (image-consumed) TELEMETRY table.
"""
Comment on lines +2339 to +2345

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Consider validating that the gNMI port is numeric before using it as L4_DST_PORT.

If TELEMETRY|gnmi|port is set to a non-numeric value, L4_DST_PORT will be invalid and caclmgrd may ignore or mis-handle the rule. Please validate that the port is numeric (int or numeric string) and either fall back to DEFAULT_GNMI_PORT or log a warning when it isn’t, so ACL behavior remains predictable under bad TELEMETRY config.

Suggested change
def _get_gnmi_port(config):
"""Return the gNMI server port for the GNMI_ONLY control-plane ACL rule.
The telemetry/gNMI container reads its listen port from
TELEMETRY|gnmi|port and falls back to 8080 when unset, so the ACL rule
follows the same lookup against the (pass-through) TELEMETRY table.
"""
def _get_gnmi_port(config):
"""Return the gNMI server port for the GNMI_ONLY control-plane ACL rule.
The telemetry/gNMI container reads its listen port from
TELEMETRY|gnmi|port and falls back to 8080 when unset, so the ACL rule
follows the same lookup against the (pass-through) TELEMETRY table.
If TELEMETRY|gnmi|port is set to a non-numeric value, fall back to
DEFAULT_GNMI_PORT and log a warning so ACL behavior remains predictable.
"""
gnmi_config = config.get("TELEMETRY", {}).get("gnmi", {})
port_value = gnmi_config.get("port")
# No port configured: use the default.
if port_value is None:
return DEFAULT_GNMI_PORT
# Accept both integer and numeric string configuration.
try:
port_int = int(port_value)
except (TypeError, ValueError):
logger.warning(
"Invalid TELEMETRY|gnmi|port=%r in config; falling back to default gNMI port %s",
port_value,
DEFAULT_GNMI_PORT,
)
return DEFAULT_GNMI_PORT
# Optionally guard against clearly invalid port ranges while keeping behavior predictable.
if not 0 < port_int < 65536:
logger.warning(
"Out-of-range TELEMETRY|gnmi|port=%d in config; falling back to default gNMI port %s",
port_int,
DEFAULT_GNMI_PORT,
)
return DEFAULT_GNMI_PORT
return str(port_int)

gnmi_config = config.get("TELEMETRY", {}).get("gnmi", {})
return str(gnmi_config.get("port", DEFAULT_GNMI_PORT))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_gnmi_port returns "None" for a present-but-null port, and the resulting rule silently leaves gNMI open.

.get("port", DEFAULT_GNMI_PORT) only substitutes the default when the key is absent. With TELEMETRY.gnmi.port: null the key is present with value None, so this returns str(None)"None"; "" stays "", and non-numeric or out-of-range values ("not-a-port", 70000) pass straight through into GNMI_ONLY|RULE_1.L4_DST_PORT (line 2405). caclmgrd can't bind that as a port, so it skips the whole EXTERNAL_CLIENT table and gNMI ends up reachable from everywhere — again with the run reporting success.

That part is just a bug: the value should be validated where it's computed (the rule is present but malformed, so a downstream "is the rule there" check wouldn't catch it). What's a judgement call is the remedy — substitute the default 8080, or fail loudly. Substituting could mismatch the actual service port; failing loudly is only right if gNMI is actually enabled, so a device not running gNMI isn't rejected over an unused port. Which way do you want it?



def _add_ctrlplane_acls(config, oob_ip, prefix_len):
"""Add control-plane ACLs restricting SNMP and gNMI to the OOB network.

Emits ACL_TABLE entries of type CTRLPLANE bound to the SNMP and
EXTERNAL_CLIENT (gNMI/telemetry) caclmgrd services, plus one ACL_RULE
per table accepting only the device's OOB management subnet (the
network-normalised oob_ip/prefix_len). caclmgrd installs an implicit
default-drop for every service bound in a CTRLPLANE table, so sources
outside the OOB subnet can no longer reach SNMP or gNMI. The SSH_ONLY
table (#2329) belongs here as well once implemented.

caclmgrd's EXTERNAL_CLIENT service has no built-in destination port
(verified against sonic-host-services 202211 through master): the rule
must carry it in L4_DST_PORT, otherwise caclmgrd skips the whole table
and the gNMI restriction would silently not exist.

ACL_TABLE and ACL_RULE are generator-owned (ON_DEMAND_OWNED_TABLE_KEYS),
so they are rebuilt from scratch on every regen and stay absent when the
device has no OOB IP. They are also multi-owner
(MULTI_OWNER_OWNED_TABLE_KEYS): the SSH_ONLY table (#2329) belongs here as
well once implemented, so this helper merges only its own SNMP_ONLY /
GNMI_ONLY keys per key rather than rebinding the table wholesale -- the
central owned-table drop in generate_sonic_config clears stale entries up
front, and per-key merge lets coexisting control-plane helpers compose. The
rules are IPv4 (SRC_IP); a non-IPv4 OOB IP logs a warning and emits nothing
rather than failing the whole config generation.
"""
network = ipaddress.ip_network(f"{oob_ip}/{prefix_len}", strict=False)
if network.version != 4:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The IPv6 path leaves SNMP and gNMI open — is that the intended behaviour?

For a device with an IPv6 OOB address this returns without emitting any SNMP/gNMI CTRLPLANE table, so both services stay reachable from every source while generate_sonic_config reports success. The docstring here (and the matching test at line 100) show this is deliberate — emit nothing rather than raise, to avoid failing config generation. So this isn't a question of whether you considered IPv6, but whether unrestricted is the outcome you want for those devices.

The reason to double-check: the only signal that hardening was skipped is a warning log, which an operator running a sync may not notice — so an IPv6-managed switch can silently end up unhardened. The alternatives are to fail loudly (loud enough that the operator actually sees it) or to implement an IPv6 filter. I'd flagged this fail-open pattern in #2340; that was a reviewer note, not a binding decision, so if you've weighed it and want this behaviour, just say so — it seemed worth surfacing rather than letting it land silently.

logger.warning(
f"OOB IP {oob_ip}/{prefix_len} is not IPv4; skipping control-plane ACLs"
)
return
Comment on lines +2377 to +2382

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Guard against invalid OOB IPs raising ValueError during ip_network() construction.

If oob_ip or prefix_len are malformed, ipaddress.ip_network will raise ValueError and abort config generation. Consider wrapping this call in a try/except ValueError, logging a warning (similar to the non-IPv4 case), and returning early so bad input only skips ACL generation rather than breaking the whole config.


accept_from_oob = {
"PRIORITY": "9999",
"PACKET_ACTION": "ACCEPT",
"SRC_IP": str(network),
"IP_TYPE": "IP",
}
config.setdefault("ACL_TABLE", {})
config["ACL_TABLE"]["SNMP_ONLY"] = {
"policy_desc": "SNMP_ONLY",
"type": "CTRLPLANE",
"services": ["SNMP"],
}
config["ACL_TABLE"]["GNMI_ONLY"] = {
"policy_desc": "GNMI_ONLY",
"type": "CTRLPLANE",
"services": ["EXTERNAL_CLIENT"],
}
config.setdefault("ACL_RULE", {})
config["ACL_RULE"]["SNMP_ONLY|RULE_1"] = dict(accept_from_oob)
config["ACL_RULE"]["GNMI_ONLY|RULE_1"] = {
**accept_from_oob,
"L4_DST_PORT": _get_gnmi_port(config),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0

"""Unit tests for ``_add_ctrlplane_acls`` in ``config_generator`` (#2330)."""

from osism.tasks.conductor.sonic.config_generator import (
ON_DEMAND_OWNED_TABLE_KEYS,
TOP_LEVEL_SCAFFOLD_KEYS,
_add_ctrlplane_acls,
)


def test_add_ctrlplane_acls_emits_snmp_and_gnmi_ctrlplane_tables():
config = {}

_add_ctrlplane_acls(config, "10.42.0.5", 24)

assert config["ACL_TABLE"] == {
"SNMP_ONLY": {
"policy_desc": "SNMP_ONLY",
"type": "CTRLPLANE",
"services": ["SNMP"],
},
"GNMI_ONLY": {
"policy_desc": "GNMI_ONLY",
"type": "CTRLPLANE",
"services": ["EXTERNAL_CLIENT"],
},
}


def test_add_ctrlplane_acls_rules_accept_network_normalised_oob_subnet():
"""SRC_IP must carry the network address, not the device's host IP."""
config = {}

_add_ctrlplane_acls(config, "10.42.0.5", 24)

assert config["ACL_RULE"]["SNMP_ONLY|RULE_1"] == {
"PRIORITY": "9999",
"PACKET_ACTION": "ACCEPT",
"SRC_IP": "10.42.0.0/24",
"IP_TYPE": "IP",
}
assert config["ACL_RULE"]["GNMI_ONLY|RULE_1"] == {
"PRIORITY": "9999",
"PACKET_ACTION": "ACCEPT",
"SRC_IP": "10.42.0.0/24",
"IP_TYPE": "IP",
"L4_DST_PORT": "8080",
}


def test_add_ctrlplane_acls_gnmi_rule_requires_dst_port_snmp_does_not():
"""caclmgrd's EXTERNAL_CLIENT service has no built-in destination port;
without L4_DST_PORT in the rule it skips the whole table and the gNMI
restriction would silently not exist. SNMP has a fixed service port
(161) in caclmgrd's ACL_SERVICES, so its rule must not pin one."""
config = {}

_add_ctrlplane_acls(config, "10.42.0.5", 24)

assert config["ACL_RULE"]["GNMI_ONLY|RULE_1"]["L4_DST_PORT"] == "8080"
assert "L4_DST_PORT" not in config["ACL_RULE"]["SNMP_ONLY|RULE_1"]


def test_add_ctrlplane_acls_gnmi_port_follows_telemetry_table():

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add negative-port coverage — this only exercises the two cases that already work.

The test covers a valid integer override (50051) and the absent-key default. Neither touches the inputs that break _get_gnmi_port: port: null, "", "not-a-port", 70000. Once the port-handling decision is made, add those as cases asserting the chosen behaviour (a raise, or substitution of the default). As-is the suite masks the defaulting bug rather than catching it.

"""A TELEMETRY|gnmi|port from the base config wins over the 8080 default
(and non-string values are normalised to the string ConfigDB expects)."""
config = {"TELEMETRY": {"gnmi": {"port": 50051}}}

_add_ctrlplane_acls(config, "10.42.0.5", 24)

assert config["ACL_RULE"]["GNMI_ONLY|RULE_1"]["L4_DST_PORT"] == "50051"


def test_add_ctrlplane_acls_merges_into_co_owned_tables_per_key():
"""ACL_TABLE / ACL_RULE are multi-owner (SSH, SNMP, gNMI): the helper must
merge only its own keys, never rebind the table wholesale, so a sibling
control-plane helper's entries survive. Stale carry-over from a prior regen
is cleared by the central owned-table drop in generate_sonic_config, not
here (see the ownership model and MULTI_OWNER_OWNED_TABLE_KEYS)."""
config = {
"ACL_TABLE": {"SSH_ONLY": {"type": "CTRLPLANE", "services": ["SSH"]}},
"ACL_RULE": {"SSH_ONLY|RULE_1": {"PRIORITY": "9999"}},
}

_add_ctrlplane_acls(config, "10.42.0.5", 24)

# A sibling helper's entries are left untouched ...
assert config["ACL_TABLE"]["SSH_ONLY"] == {"type": "CTRLPLANE", "services": ["SSH"]}
assert config["ACL_RULE"]["SSH_ONLY|RULE_1"] == {"PRIORITY": "9999"}
# ... and this helper's own entries are added alongside them.
assert set(config["ACL_TABLE"]) == {"SSH_ONLY", "SNMP_ONLY", "GNMI_ONLY"}
assert set(config["ACL_RULE"]) == {
"SSH_ONLY|RULE_1",
"SNMP_ONLY|RULE_1",
"GNMI_ONLY|RULE_1",
}


def test_add_ctrlplane_acls_non_ipv4_oob_ip_emits_nothing():

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test documents the IPv6 fail-open as intended — flagging it alongside the helper.

Asserting the tables are absent for an IPv6 OOB address pins the "emit nothing" choice as contract, which is what tells me it's deliberate rather than an oversight. No separate action here: if the helper's IPv6 behaviour changes (see the comment there), this test changes with it; if it stays, this is where the decision is recorded. Just tying the two together.

"""The rules are IPv4-only (SRC_IP); an IPv6 OOB IP must not produce
half-correct tables — and must not break config generation either."""
config = {}

_add_ctrlplane_acls(config, "2001:db8::5", 64)

assert "ACL_TABLE" not in config
assert "ACL_RULE" not in config


def test_ctrlplane_acl_tables_are_on_demand_owned():
"""#2330 requires ACL_TABLE/ACL_RULE to be rebuilt from scratch on every
regen and absent without an OOB IP. Both guarantees hang on the keys
being on-demand owned: owned tables are dropped from the base config up
front, and on-demand (unlike scaffolded) tables are not re-created as
empty dicts."""
for key in ("ACL_TABLE", "ACL_RULE"):
assert key in ON_DEMAND_OWNED_TABLE_KEYS
assert key not in TOP_LEVEL_SCAFFOLD_KEYS
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def patch(name, **kw):
_add_loopback_configuration=patch("_add_loopback_configuration"),
_add_log_server_configuration=patch("_add_log_server_configuration"),
_add_snmp_configuration=patch("_add_snmp_configuration"),
_add_ctrlplane_acls=patch("_add_ctrlplane_acls"),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add one unmocked end-to-end test — the wiring tests patch the helper, so nothing proves ACLs reach the output.

These orchestrator tests mock _add_ctrlplane_acls and prove only that it's called with the right args. There's no single test where a base config sets TELEMETRY.gnmi.port, the real generate_sonic_config runs, and the final output is asserted to contain ACL_RULE["GNMI_ONLY|RULE_1"] with that port — i.e. the configured port surviving the central owned-table drop end-to-end. That integration is currently covered only compositionally (helper unit test + drop logic separately), so a regression between them would pass. (No need to re-assert TELEMETRY preservation — the disjointness guard in test_config_generator_ownership.py already covers that; the value here is the port-through-drop flow.)

_add_vrf_configuration=patch("_add_vrf_configuration"),
_add_portchannel_configuration=patch("_add_portchannel_configuration"),
get_cached_device_interfaces=patch(
Expand Down Expand Up @@ -395,6 +396,43 @@ def test_generate_sonic_config_no_oob_ip_leaves_mgmt_empty_and_passes_none(
assert snmp_oob is None


def test_generate_sonic_config_oob_ip_wires_ctrlplane_acls(
mocker, patch_orchestrator_helpers, make_orchestrator_device
):
"""With an OOB IP the orchestrator delegates the control-plane ACLs
(#2330) to ``_add_ctrlplane_acls`` with the raw OOB IP and prefix —
network normalisation is the helper's job."""
patch_base_config(mocker)
patch_orchestrator_helpers.get_device_oob_ip.return_value = ("10.42.0.5", 24)
device = make_orchestrator_device()

config = generate_sonic_config(device, "HWSKU")

patch_orchestrator_helpers._add_ctrlplane_acls.assert_called_once_with(
config, "10.42.0.5", 24
)


def test_generate_sonic_config_no_oob_ip_skips_ctrlplane_acls(
mocker, patch_orchestrator_helpers, make_orchestrator_device
):
"""Without an OOB IP no control-plane ACLs are wired and the owned
ACL_TABLE / ACL_RULE tables stay absent — stale base-config content is
removed by the up-front owned-table drop, not re-created."""
base = make_base_config()
base["ACL_TABLE"] = {"SNMP_ONLY": {"type": "CTRLPLANE"}}
base["ACL_RULE"] = {"SNMP_ONLY|RULE_1": {"PRIORITY": "9999"}}
patch_base_config(mocker, base_config=base)
patch_orchestrator_helpers.get_device_oob_ip.return_value = None
device = make_orchestrator_device()

config = generate_sonic_config(device, "HWSKU")

patch_orchestrator_helpers._add_ctrlplane_acls.assert_not_called()
assert "ACL_TABLE" not in config
assert "ACL_RULE" not in config


# ---------------------------------------------------------------------------
# generate_sonic_config — breakout merge
# ---------------------------------------------------------------------------
Expand Down