Skip to content
Open
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ CHANGELOG
- Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale.
This new mechanism relies on shared storage to sync updates between the head node and compute nodes.
- Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale.
- Support updates of `Tags` during cluster-updates.
- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions.
- Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource.
- This overrides the parallelcluster default using a shallow merge.
- Add alarm on missing clustermgtd heartbeat.
Expand Down
26 changes: 26 additions & 0 deletions cli/src/pcluster/config/update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import re
from enum import Enum

from pcluster.aws.common import get_region
from pcluster.config.cluster_config import QueueUpdateStrategy
from pcluster.config.update_policy_utils import SharedStorageChangeInfo
from pcluster.constants import (
Expand Down Expand Up @@ -147,6 +148,12 @@ def is_slurm_queues_change(change):
return any(path.startswith("SlurmQueues[") for path in change.path)


def _is_adc_region():
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment / TODO for this commit so that we remember to revert this commit once Tag is supported in us-iso-region

Copy link
Copy Markdown
Contributor

@himani2411 himani2411 May 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also lets add a TBR or Some kind of Tag in commit message so that we can track which commit needs to be reverted in future

"""Return True if the currently configured AWS region belongs to an ADC partition."""
region = get_region() or ""
return region.startswith("us-iso")


def extract_type_and_name_from_path(path):
# Example path = 'SlurmQueues[slurm-q-name]'
# This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name'
Expand Down Expand Up @@ -696,6 +703,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str:
+ ". If you need this change, please consider creating a new cluster instead of updating the existing one.",
)

# Update supported everywhere except ADC regions (us-iso*, us-isob*).
#
# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes
# both a Tags change and a resource whose change is only in Metadata does not update that
# resource. This breaks the head node update flow because cfn-hup on the head node polls
# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them,
# leaving the HeadNodeWaitCondition to time out.
#
# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions.
UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy(
name="SUPPORTED_UNLESS_ADC",
level=1000,
fail_reason=lambda change, patch: (
f"Updating '{change.key}' during a cluster update is not supported in ADC regions."
),
action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.",
condition_checker=lambda change, patch: not _is_adc_region(),
)

# Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy
UpdatePolicy.MANAGED_FSX = UpdatePolicy(
name="MANAGED_FSX",
Expand Down
6 changes: 5 additions & 1 deletion cli/src/pcluster/schemas/cluster_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1866,7 +1866,11 @@ class ClusterSchema(BaseSchema):

monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"})
tags = fields.Nested(
TagSchema,
many=True,
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"},
)
iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
directory_service = fields.Nested(
DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pcluster/schemas/common_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class TagSchema(BaseSchema):
value = fields.Str(
required=True,
validate=validate.Length(max=256),
metadata={"update_policy": UpdatePolicy.SUPPORTED},
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC},
)

@post_load
Expand Down
16 changes: 8 additions & 8 deletions cli/tests/pcluster/config/test_config_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_addition",
),
pytest.param(
Expand All @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
{"Key": "test2", "Value": "val2"},
None,
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_removal",
),
pytest.param(
Expand All @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket(
"Value",
"old_value",
"new_value",
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=False,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_value_modification",
),
pytest.param(
Expand All @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="order_change_plus_addition",
),
pytest.param(
Expand Down
41 changes: 41 additions & 0 deletions cli/tests/pcluster/config/test_update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3044,3 +3044,44 @@ def test_head_node_local_storage_update_blocked(
if expected_action_needed:
assert_that(action_needed_messages).is_not_empty()
assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed)


@pytest.mark.parametrize(
"region, expected_result",
[
pytest.param("us-east-1", True, id="commercial region allows tag update"),
pytest.param("eu-west-1", True, id="commercial EU region allows tag update"),
pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"),
pytest.param("cn-north-1", True, id="China region allows tag update"),
pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"),
pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"),
pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"),
pytest.param("", True, id="empty region falls back to allowing update"),
],
)
def test_supported_unless_adc_condition_checker(mocker, region, expected_result):
"""SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*)."""
mocker.patch("pcluster.config.update_policy.get_region", return_value=region)

change_mock = mocker.MagicMock()
patch_mock = mocker.MagicMock()

assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to(
expected_result
)


def test_supported_unless_adc_fail_reason_mentions_adc(mocker):
"""The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation."""
mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1")

change_mock = mocker.MagicMock()
change_mock.key = "Tags"
patch_mock = mocker.MagicMock()

result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock)

assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED)
assert_that(fail_reason).contains("ADC")
assert_that(fail_reason).contains("Tags")
assert_that(action_needed).contains("Tags")
16 changes: 9 additions & 7 deletions tests/integration-tests/configs/isolated_regions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -599,13 +599,15 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
proxy:
test_proxy.py::test_proxy:
dimensions:
- regions: {{ REGIONS }}
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
# This test has never passed in isolated regions because our proxy instance setup logic in the test needs to be adjusted
# If proxy instance is set up correctly, pcluster should be able to use proxy in isolated regions
# proxy:
# test_proxy.py::test_proxy:
# dimensions:
# - regions: {{ REGIONS }}
# instances: {{ INSTANCES }}
# oss: {{ OSS }}
# schedulers: {{ SCHEDULERS }}
pyxis:
test_pyxis.py::test_pyxis:
dimensions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,23 @@
)


def _get_global_build_number(config=None, args=None):
"""
Gets the global build number from args or pytest config.
Returns the build number as an int, or 0 if not provided.
"""
global_build_number = 0
if args:
args_dict = vars(args)
global_build_number = args_dict.get("global_build_number", 0)
elif config:
global_build_number = config.getoption("--global-build-number", default=0)
try:
return int(global_build_number)
except (TypeError, ValueError):
return 0


def _get_os_parameters(config=None, args=None):
"""
Gets OS jinja parameters.
Expand All @@ -42,9 +59,17 @@ def _get_os_parameters(config=None, args=None):
available_amis_oss_x86 = _get_available_amis_oss("x86", config=config, args=args)
available_amis_oss_arm = _get_available_amis_oss("arm", config=config, args=args)
result = {"AVAILABLE_AMIS_OSS_X86": available_amis_oss_x86, "AVAILABLE_AMIS_OSS_ARM": available_amis_oss_arm}
today_number = (date.today() - date(2020, 1, 1)).days

_propagate_os_jinja_variables("", result, today_number, SUPPORTED_OSES)
# Use global-build-number as the rotation seed if available and non-zero.
# This allows the OS rotation to advance on every build, enabling full coverage
# when running tests multiple times per day. Falls back to day-based rotation otherwise.
global_build_number = _get_global_build_number(config=config, args=args)
if global_build_number:
rotation_seed = global_build_number
else:
rotation_seed = (date.today() - date(2020, 1, 1)).days

_propagate_os_jinja_variables("", result, rotation_seed, SUPPORTED_OSES)

# DCV doesn't support AL2023. Therefore, the following logic makes sure the DCV jinja parameter is not AL2023
dcv_supported_oses = [
Expand All @@ -55,23 +80,23 @@ def _get_os_parameters(config=None, args=None):
for os in SUPPORTED_OSES
if os not in UNSUPPORTED_OSES_FOR_DCV + UNSUPPORTED_ARM_OSES_FOR_DCV + UNSUPPORTED_OSES_FOR_NON_GPU_DCV
]
_propagate_os_jinja_variables("DCV_", result, today_number, dcv_supported_oses, dcv_supported_arm_oses)
_propagate_os_jinja_variables("DCV_", result, rotation_seed, dcv_supported_oses, dcv_supported_arm_oses)

lustre_supported_oses = [os for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_LUSTRE]
_propagate_os_jinja_variables("LUSTRE_", result, today_number, lustre_supported_oses)
_propagate_os_jinja_variables("LUSTRE_", result, rotation_seed, lustre_supported_oses)

no_rhel_oss = [os for os in SUPPORTED_OSES if "rhel" not in os]
_propagate_os_jinja_variables("NO_RHEL_", result, today_number, no_rhel_oss)
_propagate_os_jinja_variables("NO_RHEL_", result, rotation_seed, no_rhel_oss)

no_rocky_oss = [os for os in SUPPORTED_OSES if "rocky" not in os]
_propagate_os_jinja_variables("NO_ROCKY_", result, today_number, no_rocky_oss)
_propagate_os_jinja_variables("NO_ROCKY_", result, rotation_seed, no_rocky_oss)

rhel_oss = [os for os in SUPPORTED_OSES if "rhel" in os]
_propagate_os_jinja_variables("RHEL_", result, today_number, rhel_oss)
_propagate_os_jinja_variables("RHEL_", result, rotation_seed, rhel_oss)
return result


def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_oses, supported_arm_oses=None):
def _propagate_os_jinja_variables(prefix, result, rotation_seed, supported_x86_oses, supported_arm_oses=None):
available_amis_oss_x86 = result["AVAILABLE_AMIS_OSS_X86"]
available_amis_oss_arm = result["AVAILABLE_AMIS_OSS_ARM"]
if supported_arm_oses is None:
Expand All @@ -83,8 +108,12 @@ def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_os
result[f"{prefix}OS_X86"] = available_amis_oss_x86
result[f"{prefix}OS_ARM"] = available_amis_oss_arm
for index in range(len(supported_x86_oses)):
result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[(today_number + index) % len(available_amis_oss_x86)]
result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[(today_number + index) % len(available_amis_oss_arm)]
result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[
(rotation_seed + index) % len(available_amis_oss_x86)
]
result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[
(rotation_seed + index) % len(available_amis_oss_arm)
]


def _get_instance_type_parameters(): # noqa: C901
Expand Down
30 changes: 24 additions & 6 deletions tests/integration-tests/tests/common/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ def assert_default_user_has_desired_sudo_access(
node_type,
region,
disable_sudo_access_default_user,
retries=6,
retry_delay=15,
):
remote_command_executors = []
logging.info(
Expand All @@ -354,12 +356,28 @@ def assert_default_user_has_desired_sudo_access(

command = "sudo -n cat /etc/sudoers.d/90-cloud-init-users"
for node_index, remote_command_executor in enumerate(remote_command_executors):
result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300)
logging.info(f"Default user in {node_type} number {node_index} and result.failed={result.failed}")
logging.info(f"Default user in {node_type} number {node_index} and result.stdout={result.stdout}")
if disable_sudo_access_default_user:
assert_that(result.stdout).contains("a password is required")
assert_that(result.failed).is_equal_to(disable_sudo_access_default_user)
for attempt in range(retries):
result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300)
logging.info(
f"Default user in {node_type} number {node_index} attempt {attempt} "
f"result.failed={result.failed} result.stdout={result.stdout}"
)
if disable_sudo_access_default_user:
if "a password is required" in result.stdout and result.failed:
logging.info(f"Sudo access correctly disabled on {node_type} number {node_index}")
break
if attempt < retries - 1:
logging.info(
f"Sudo not yet disabled on {node_type} number {node_index}, "
f"retrying in {retry_delay}s (attempt {attempt + 1}/{retries})..."
)
time.sleep(retry_delay)
else:
assert_that(result.stdout).contains("a password is required")
assert_that(result.failed).is_equal_to(True)
else:
assert_that(result.failed).is_equal_to(False)
break


def assert_lambda_vpc_settings_are_correct(stack_name, region, security_group_ids, subnet_ids):
Expand Down
2 changes: 1 addition & 1 deletion tests/integration-tests/tests/common/nccl_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from assertpy import assert_that
from utils import get_instance_info

from tests.performance_tests.common import push_result_to_dynamodb
from tests.common.utils import push_result_to_dynamodb

NCCL_COMMON_DATADIR = pathlib.Path(__file__).parent / "data/nccl/"

Expand Down
35 changes: 35 additions & 0 deletions tests/integration-tests/tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
import random
import string
import time
import uuid
from importlib.metadata import version as get_package_version

import boto3
from assertpy import assert_that
from botocore.exceptions import ClientError
from framework.framework_constants import METADATA_DEFAULT_REGION, PERFORMANCE_METADATA_TABLE
from framework.metadata_table_manager import MetadataTableManager
from packaging import version as packaging_version
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
from retrying import retry
Expand Down Expand Up @@ -644,6 +647,38 @@ def get_capacity_reservation_id(request, instance_type, region, count, os):
return reservations_ids


def push_result_to_dynamodb(name, result, instance, os, mpi_variation=None, num_instances=None):
reporting_region = METADATA_DEFAULT_REGION
logging.info(f"Metadata reporting region {reporting_region}")
# Create the metadata table in case it doesn't exist
MetadataTableManager(reporting_region, PERFORMANCE_METADATA_TABLE).create_metadata_table()
try:
# Create DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=reporting_region)
table = dynamodb.Table(PERFORMANCE_METADATA_TABLE)

# Prepare item to be inserted
item = {
"id": str(uuid.uuid4().hex),
"name": name,
"instance": instance,
"os": os,
"timestamp": int(time.time()),
"result": str(result),
"pcluster_version": f"v{get_installed_parallelcluster_version()}",
"mpi_variation": str(mpi_variation),
"num_instances": num_instances,
}

# Put item in the table
table.put_item(Item=item)
logging.info(f"Successfully pushed result to DynamoDB with id: {item['id']}")

except Exception as e:
logging.error(f"Failed to push result to DynamoDB: {str(e)}")
raise


@retry(stop_max_attempt_number=3, wait_fixed=seconds(10))
def _download_and_upload_to_s3(url, bucket_name, s3_key, s3_client):
"""Download a file from a URL and upload it to S3, with retries for transient network failures."""
Expand Down
Loading
Loading