From 9f2dbeec7714c1836a2f23f1b1e9fb84ed7253ec Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 24 Mar 2026 08:39:31 -0700 Subject: [PATCH 1/9] [integ-tests] Skip test_proxy in isolated_regions.yaml --- .../configs/isolated_regions.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index e2bd946ae0..15a594bf62 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -599,13 +599,15 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} - proxy: - test_proxy.py::test_proxy: - dimensions: - - regions: {{ REGIONS }} - instances: {{ INSTANCES }} - oss: {{ OSS }} - schedulers: {{ SCHEDULERS }} +# This test has never passed in isolated regions because our proxy instance setup logic in the test needs to be adjusted +# If proxy instance is set up correctly, pcluster should be able to use proxy in isolated regions +# proxy: +# test_proxy.py::test_proxy: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} pyxis: test_pyxis.py::test_pyxis: dimensions: From 68d3ec0788346cf146494876c7238f42e2c10803 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 14 Apr 2026 11:28:01 -0700 Subject: [PATCH 2/9] [integ-tests] Skip root volume tags check in us-iso regions The tags are not propagated to root volumes in us-iso regions --- tests/integration-tests/tests/tags/test_tag_propagation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index 290f8ec186..e49a4d99a5 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -127,6 +127,7 @@ def _check_tag_propagation( {"Name": "HeadNode", "parallelcluster:node-type": "HeadNode"}, ), "tag_getter_kwargs": {"cluster": cluster, "os": os}, + "skip": "us-iso" in cluster.region, }, { "resource": "Compute Node", From 3c80ad5fded5ced769ab3b47ea70738bd7dd4fd9 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 14 Apr 2026 10:50:48 -0700 Subject: [PATCH 3/9] [integ-tests-framework] Move push_result_to_dynamodb to tests/common/utils The SSM automation test pipeline deletes tests/performance_tests before running tests, breaking nccl_common.py's top-level import from that package. Moreover, The function is a generic DynamoDB reporting utility with no performance-test-specific logic, so tests/common/utils is a more appropriate home regardless. --- .../tests/common/nccl_common.py | 2 +- tests/integration-tests/tests/common/utils.py | 35 +++++++++++++++++ .../tests/performance_tests/common.py | 38 +------------------ .../tests/performance_tests/test_openfoam.py | 3 +- .../tests/performance_tests/test_osu.py | 2 +- .../tests/performance_tests/test_starccm.py | 9 ++++- 6 files changed, 47 insertions(+), 42 deletions(-) diff --git a/tests/integration-tests/tests/common/nccl_common.py b/tests/integration-tests/tests/common/nccl_common.py index 355ab9cf08..f2d7a32bc6 100644 --- a/tests/integration-tests/tests/common/nccl_common.py +++ b/tests/integration-tests/tests/common/nccl_common.py @@ -17,7 +17,7 @@ from assertpy import assert_that from utils import get_instance_info -from tests.performance_tests.common import push_result_to_dynamodb +from tests.common.utils import push_result_to_dynamodb NCCL_COMMON_DATADIR = pathlib.Path(__file__).parent / "data/nccl/" diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index be8762aa54..877a6ac068 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -15,11 +15,14 @@ import random import string import time +import uuid from importlib.metadata import version as get_package_version import boto3 from assertpy import assert_that from botocore.exceptions import ClientError +from framework.framework_constants import METADATA_DEFAULT_REGION, PERFORMANCE_METADATA_TABLE +from framework.metadata_table_manager import MetadataTableManager from packaging import version as packaging_version from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor from retrying import retry @@ -644,6 +647,38 @@ def get_capacity_reservation_id(request, instance_type, region, count, os): return reservations_ids +def push_result_to_dynamodb(name, result, instance, os, mpi_variation=None, num_instances=None): + reporting_region = METADATA_DEFAULT_REGION + logging.info(f"Metadata reporting region {reporting_region}") + # Create the metadata table in case it doesn't exist + MetadataTableManager(reporting_region, PERFORMANCE_METADATA_TABLE).create_metadata_table() + try: + # Create DynamoDB resource + dynamodb = boto3.resource("dynamodb", region_name=reporting_region) + table = dynamodb.Table(PERFORMANCE_METADATA_TABLE) + + # Prepare item to be inserted + item = { + "id": str(uuid.uuid4().hex), + "name": name, + "instance": instance, + "os": os, + "timestamp": int(time.time()), + "result": str(result), + "pcluster_version": f"v{get_installed_parallelcluster_version()}", + "mpi_variation": str(mpi_variation), + "num_instances": num_instances, + } + + # Put item in the table + table.put_item(Item=item) + logging.info(f"Successfully pushed result to DynamoDB with id: {item['id']}") + + except Exception as e: + logging.error(f"Failed to push result to DynamoDB: {str(e)}") + raise + + @retry(stop_max_attempt_number=3, wait_fixed=seconds(10)) def _download_and_upload_to_s3(url, bucket_name, s3_key, s3_client): """Download a file from a URL and upload it to S3, with retries for transient network failures.""" diff --git a/tests/integration-tests/tests/performance_tests/common.py b/tests/integration-tests/tests/performance_tests/common.py index 3441d0f940..ac7384875f 100644 --- a/tests/integration-tests/tests/performance_tests/common.py +++ b/tests/integration-tests/tests/performance_tests/common.py @@ -3,20 +3,16 @@ import os as os_lib import pathlib import shutil -import time -import uuid from math import ceil from os import makedirs import boto3 from assertpy import assert_that -from framework.framework_constants import METADATA_DEFAULT_REGION, PERFORMANCE_METADATA_TABLE -from framework.metadata_table_manager import MetadataTableManager from retrying import retry from time_utils import seconds from utils import get_username_for_os -from tests.common.utils import fetch_instance_slots, get_installed_parallelcluster_version +from tests.common.utils import fetch_instance_slots # Common settings used by all test scenarios # You can change these variables to adapt the performance test to your needs. @@ -253,35 +249,3 @@ def _log_output_performance_difference(node, performance_degradation, observed_v f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" ) - - -def push_result_to_dynamodb(name, result, instance, os, mpi_variation=None, num_instances=None): - reporting_region = METADATA_DEFAULT_REGION - logging.info(f"Metadata reporting region {reporting_region}") - # Create the metadata table in case it doesn't exist - MetadataTableManager(reporting_region, PERFORMANCE_METADATA_TABLE).create_metadata_table() - try: - # Create DynamoDB resource - dynamodb = boto3.resource("dynamodb", region_name=reporting_region) - table = dynamodb.Table(PERFORMANCE_METADATA_TABLE) - - # Prepare item to be inserted - item = { - "id": str(uuid.uuid4().hex), - "name": name, - "instance": instance, - "os": os, - "timestamp": int(time.time()), - "result": str(result), - "pcluster_version": f"v{get_installed_parallelcluster_version()}", - "mpi_variation": str(mpi_variation), - "num_instances": num_instances, - } - - # Put item in the table - table.put_item(Item=item) - logging.info(f"Successfully pushed result to DynamoDB with id: {item['id']}") - - except Exception as e: - logging.error(f"Failed to push result to DynamoDB: {str(e)}") - raise diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 811b585c28..0a7fa2ce3d 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -4,7 +4,8 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.performance_tests.common import _log_output_performance_difference, push_result_to_dynamodb +from tests.common.utils import push_result_to_dynamodb +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds OPENFOAM_INSTALLATION_TIMEOUT = 300 diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index b996831c10..ec7d802187 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -22,10 +22,10 @@ from tests.common.utils import ( fetch_instance_slots, get_capacity_reservation_id, + push_result_to_dynamodb, run_system_analyzer, write_file, ) -from tests.performance_tests.common import push_result_to_dynamodb # We collected OSU benchmarks results for these instance types OSU_BENCHMARKS_INSTANCES = ["c5n.18xlarge", "p5en.48xlarge", "p6-b200.48xlarge"] diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 77f8d97ed5..1935a3cf55 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -5,8 +5,13 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.common.utils import assert_no_file_handler_leak, fetch_instance_slots, get_compute_ip_to_num_files -from tests.performance_tests.common import _log_output_performance_difference, push_result_to_dynamodb +from tests.common.utils import ( + assert_no_file_handler_leak, + fetch_instance_slots, + get_compute_ip_to_num_files, + push_result_to_dynamodb, +) +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds STARCCM_INSTALLATION_TIMEOUT = 1800 From d9f35934b19941d9411cf66ae8eb767b21b53b52 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 4 May 2026 08:33:49 -0700 Subject: [PATCH 4/9] Block Tags updates on cluster update in ADC regions In us-iso*, us-isob* regions, CloudFormation has a behavior where an UpdateStack call that includes both a Tags change and a resource whose change is only in Metadata does not update that resource. This breaks the head node update flow: cfn-hup on the head node polls DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate, never sees them when tags are updated in the same call, and the HeadNodeWaitCondition times out after 30 minutes. Until the CloudFormation behavior is addressed, block tag updates at the validation layer in ADC regions so the failure mode surfaces immediately. Commercial, GovCloud, and China regions are unaffected. Policy changes: - Add UpdatePolicy.SUPPORTED_UNLESS_ADC, equivalent to SUPPORTED outside ADC and UNSUPPORTED inside ADC. - Cover the new policy with parametrized unit tests across commercial, GovCloud, China, us-iso, us-isob, and empty region values. --- CHANGELOG.md | 2 +- cli/src/pcluster/config/update_policy.py | 26 ++++++++++++ cli/src/pcluster/schemas/cluster_schema.py | 6 ++- cli/src/pcluster/schemas/common_schema.py | 2 +- .../pcluster/config/test_config_patch.py | 16 ++++---- .../pcluster/config/test_update_policy.py | 41 +++++++++++++++++++ .../tests/tags/test_tag_propagation.py | 13 +++++- .../pcluster.config.update.queue_update.yaml | 13 ++++++ .../pcluster.config.update.yaml | 13 ++++++ 9 files changed, 119 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 624dade051..db12315fd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,7 @@ CHANGELOG - Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale. This new mechanism relies on shared storage to sync updates between the head node and compute nodes. - Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale. -- Support updates of `Tags` during cluster-updates. +- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions. - Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource. - This overrides the parallelcluster default using a shallow merge. - Add alarm on missing clustermgtd heartbeat. diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index 4ff16a58e9..cd7afd94ea 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -11,6 +11,7 @@ import re from enum import Enum +from pcluster.aws.common import get_region from pcluster.config.cluster_config import QueueUpdateStrategy from pcluster.config.update_policy_utils import SharedStorageChangeInfo from pcluster.constants import ( @@ -147,6 +148,12 @@ def is_slurm_queues_change(change): return any(path.startswith("SlurmQueues[") for path in change.path) +def _is_adc_region(): + """Return True if the currently configured AWS region belongs to an ADC partition.""" + region = get_region() or "" + return region.startswith("us-iso") + + def extract_type_and_name_from_path(path): # Example path = 'SlurmQueues[slurm-q-name]' # This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name' @@ -696,6 +703,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str: + ". If you need this change, please consider creating a new cluster instead of updating the existing one.", ) +# Update supported everywhere except ADC regions (us-iso*, us-isob*). +# +# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes +# both a Tags change and a resource whose change is only in Metadata does not update that +# resource. This breaks the head node update flow because cfn-hup on the head node polls +# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them, +# leaving the HeadNodeWaitCondition to time out. +# +# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions. +UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy( + name="SUPPORTED_UNLESS_ADC", + level=1000, + fail_reason=lambda change, patch: ( + f"Updating '{change.key}' during a cluster update is not supported in ADC regions." + ), + action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.", + condition_checker=lambda change, patch: not _is_adc_region(), +) + # Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy UpdatePolicy.MANAGED_FSX = UpdatePolicy( name="MANAGED_FSX", diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index 601535ec8b..35762f002e 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -1866,7 +1866,11 @@ class ClusterSchema(BaseSchema): monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"}) + tags = fields.Nested( + TagSchema, + many=True, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"}, + ) iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) directory_service = fields.Nested( DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP} diff --git a/cli/src/pcluster/schemas/common_schema.py b/cli/src/pcluster/schemas/common_schema.py index 32d6bf1146..da2030f244 100644 --- a/cli/src/pcluster/schemas/common_schema.py +++ b/cli/src/pcluster/schemas/common_schema.py @@ -183,7 +183,7 @@ class TagSchema(BaseSchema): value = fields.Str( required=True, validate=validate.Length(max=256), - metadata={"update_policy": UpdatePolicy.SUPPORTED}, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC}, ) @post_load diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 2a89c314cc..d38c97eba1 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_addition", ), pytest.param( @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", {"Key": "test2", "Value": "val2"}, None, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_removal", ), pytest.param( @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket( "Value", "old_value", "new_value", - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=False, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_value_modification", ), pytest.param( @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="order_change_plus_addition", ), pytest.param( diff --git a/cli/tests/pcluster/config/test_update_policy.py b/cli/tests/pcluster/config/test_update_policy.py index b88d9d0668..9e47e3b357 100644 --- a/cli/tests/pcluster/config/test_update_policy.py +++ b/cli/tests/pcluster/config/test_update_policy.py @@ -3044,3 +3044,44 @@ def test_head_node_local_storage_update_blocked( if expected_action_needed: assert_that(action_needed_messages).is_not_empty() assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed) + + +@pytest.mark.parametrize( + "region, expected_result", + [ + pytest.param("us-east-1", True, id="commercial region allows tag update"), + pytest.param("eu-west-1", True, id="commercial EU region allows tag update"), + pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"), + pytest.param("cn-north-1", True, id="China region allows tag update"), + pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"), + pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"), + pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"), + pytest.param("", True, id="empty region falls back to allowing update"), + ], +) +def test_supported_unless_adc_condition_checker(mocker, region, expected_result): + """SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*).""" + mocker.patch("pcluster.config.update_policy.get_region", return_value=region) + + change_mock = mocker.MagicMock() + patch_mock = mocker.MagicMock() + + assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to( + expected_result + ) + + +def test_supported_unless_adc_fail_reason_mentions_adc(mocker): + """The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation.""" + mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1") + + change_mock = mocker.MagicMock() + change_mock.key = "Tags" + patch_mock = mocker.MagicMock() + + result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock) + + assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED) + assert_that(fail_reason).contains("ADC") + assert_that(fail_reason).contains("Tags") + assert_that(action_needed).contains("Tags") diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index e49a4d99a5..ab592af533 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -63,8 +63,17 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os # Makes sure that the compute nodes have started before checking tags _wait_for_compute_fleet_start(cluster) - # Checks for tag propagation - _check_tag_propagation(cluster, scheduler, os, volume_name, add_additional_config_tags=True) + # Checks for tag propagation. + # In ADC regions, CloudFormation doesn't properly update stack tags during a cluster update (see the + # SUPPORTED_UNLESS_ADC update policy), so AdditionalConfigTag is intentionally + # not added to the updated configuration and we skip asserting it here. + _check_tag_propagation( + cluster, + scheduler, + os, + volume_name, + add_additional_config_tags="us-iso" not in cluster.region, + ) _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml index f26ba6dbb4..b74bf20f92 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml @@ -1,12 +1,25 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag Value: ConfigFileTagValue - Key: QueueOverrideTag Value: ClusterLevelValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml index e7d09418dd..e8458f36c2 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml @@ -1,6 +1,18 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag @@ -9,6 +21,7 @@ Tags: Value: ClusterLevelValue - Key: AdditionalConfigTag Value: AdditionalConfigTagValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: From 9c40b3019fc71d316218c56dd285b0adb0fb7b6f Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 4 May 2026 09:03:56 -0700 Subject: [PATCH 5/9] [integ-tests] Remove force flag during update tags --- tests/integration-tests/tests/tags/test_tag_propagation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index ab592af533..2c224d7891 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -26,6 +26,7 @@ get_shared_volume_tags, ) from time_utils import minutes, seconds +from utils import wait_for_computefleet_changed from tests.common.schedulers_common import SlurmCommands from tests.common.utils import get_installed_parallelcluster_version @@ -53,10 +54,10 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os _check_tag_propagation(cluster, scheduler, os, volume_name) cluster.stop() - + wait_for_computefleet_changed(cluster, "STOPPED") # Updates cluster with new configuration updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name) - cluster.update(str(updated_cluster_config), force_update="true") + cluster.update(str(updated_cluster_config)) cluster.start() From e48bf94d1d9c01745adbdb53fa0ef0849162d414 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 26 May 2026 12:04:31 -0700 Subject: [PATCH 6/9] [integ-tests] Use global-build-number as OS rotation seed Previously, OS rotation was seeded by day-of-year, meaning multiple builds on the same day would always test the same OS. This limited coverage when running tests more than once per day. Now, when --global-build-number is provided and non-zero, it is used as the rotation seed instead. Each build picks a different OS, enabling full coverage across consecutive pipeline runs. Falls back to day-based rotation when global-build-number is None or 0. --- .../tests_configuration/config_renderer.py | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/tests/integration-tests/framework/tests_configuration/config_renderer.py b/tests/integration-tests/framework/tests_configuration/config_renderer.py index 1e4aade100..f2fa7a52f0 100644 --- a/tests/integration-tests/framework/tests_configuration/config_renderer.py +++ b/tests/integration-tests/framework/tests_configuration/config_renderer.py @@ -32,6 +32,23 @@ ) +def _get_global_build_number(config=None, args=None): + """ + Gets the global build number from args or pytest config. + Returns the build number as an int, or 0 if not provided. + """ + global_build_number = 0 + if args: + args_dict = vars(args) + global_build_number = args_dict.get("global_build_number", 0) + elif config: + global_build_number = config.getoption("--global-build-number", default=0) + try: + return int(global_build_number) + except (TypeError, ValueError): + return 0 + + def _get_os_parameters(config=None, args=None): """ Gets OS jinja parameters. @@ -42,9 +59,17 @@ def _get_os_parameters(config=None, args=None): available_amis_oss_x86 = _get_available_amis_oss("x86", config=config, args=args) available_amis_oss_arm = _get_available_amis_oss("arm", config=config, args=args) result = {"AVAILABLE_AMIS_OSS_X86": available_amis_oss_x86, "AVAILABLE_AMIS_OSS_ARM": available_amis_oss_arm} - today_number = (date.today() - date(2020, 1, 1)).days - _propagate_os_jinja_variables("", result, today_number, SUPPORTED_OSES) + # Use global-build-number as the rotation seed if available and non-zero. + # This allows the OS rotation to advance on every build, enabling full coverage + # when running tests multiple times per day. Falls back to day-based rotation otherwise. + global_build_number = _get_global_build_number(config=config, args=args) + if global_build_number: + rotation_seed = global_build_number + else: + rotation_seed = (date.today() - date(2020, 1, 1)).days + + _propagate_os_jinja_variables("", result, rotation_seed, SUPPORTED_OSES) # DCV doesn't support AL2023. Therefore, the following logic makes sure the DCV jinja parameter is not AL2023 dcv_supported_oses = [ @@ -55,23 +80,23 @@ def _get_os_parameters(config=None, args=None): for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_DCV + UNSUPPORTED_ARM_OSES_FOR_DCV + UNSUPPORTED_OSES_FOR_NON_GPU_DCV ] - _propagate_os_jinja_variables("DCV_", result, today_number, dcv_supported_oses, dcv_supported_arm_oses) + _propagate_os_jinja_variables("DCV_", result, rotation_seed, dcv_supported_oses, dcv_supported_arm_oses) lustre_supported_oses = [os for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_LUSTRE] - _propagate_os_jinja_variables("LUSTRE_", result, today_number, lustre_supported_oses) + _propagate_os_jinja_variables("LUSTRE_", result, rotation_seed, lustre_supported_oses) no_rhel_oss = [os for os in SUPPORTED_OSES if "rhel" not in os] - _propagate_os_jinja_variables("NO_RHEL_", result, today_number, no_rhel_oss) + _propagate_os_jinja_variables("NO_RHEL_", result, rotation_seed, no_rhel_oss) no_rocky_oss = [os for os in SUPPORTED_OSES if "rocky" not in os] - _propagate_os_jinja_variables("NO_ROCKY_", result, today_number, no_rocky_oss) + _propagate_os_jinja_variables("NO_ROCKY_", result, rotation_seed, no_rocky_oss) rhel_oss = [os for os in SUPPORTED_OSES if "rhel" in os] - _propagate_os_jinja_variables("RHEL_", result, today_number, rhel_oss) + _propagate_os_jinja_variables("RHEL_", result, rotation_seed, rhel_oss) return result -def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_oses, supported_arm_oses=None): +def _propagate_os_jinja_variables(prefix, result, rotation_seed, supported_x86_oses, supported_arm_oses=None): available_amis_oss_x86 = result["AVAILABLE_AMIS_OSS_X86"] available_amis_oss_arm = result["AVAILABLE_AMIS_OSS_ARM"] if supported_arm_oses is None: @@ -83,8 +108,12 @@ def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_os result[f"{prefix}OS_X86"] = available_amis_oss_x86 result[f"{prefix}OS_ARM"] = available_amis_oss_arm for index in range(len(supported_x86_oses)): - result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[(today_number + index) % len(available_amis_oss_x86)] - result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[(today_number + index) % len(available_amis_oss_arm)] + result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[ + (rotation_seed + index) % len(available_amis_oss_x86) + ] + result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[ + (rotation_seed + index) % len(available_amis_oss_arm) + ] def _get_instance_type_parameters(): # noqa: C901 From 206f98ab7b15209195af86ad65c2e11e92370796 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 15 May 2026 16:13:56 -0400 Subject: [PATCH 7/9] [Test] Add retry to handle the race condition between test code and cookbook recipies --- .../tests/common/assertions.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 2043986800..5cbd6bd743 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -338,6 +338,8 @@ def assert_default_user_has_desired_sudo_access( node_type, region, disable_sudo_access_default_user, + retries=6, + retry_delay=15, ): remote_command_executors = [] logging.info( @@ -354,12 +356,28 @@ def assert_default_user_has_desired_sudo_access( command = "sudo -n cat /etc/sudoers.d/90-cloud-init-users" for node_index, remote_command_executor in enumerate(remote_command_executors): - result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300) - logging.info(f"Default user in {node_type} number {node_index} and result.failed={result.failed}") - logging.info(f"Default user in {node_type} number {node_index} and result.stdout={result.stdout}") - if disable_sudo_access_default_user: - assert_that(result.stdout).contains("a password is required") - assert_that(result.failed).is_equal_to(disable_sudo_access_default_user) + for attempt in range(retries): + result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300) + logging.info( + f"Default user in {node_type} number {node_index} attempt {attempt} " + f"result.failed={result.failed} result.stdout={result.stdout}" + ) + if disable_sudo_access_default_user: + if "a password is required" in result.stdout and result.failed: + logging.info(f"Sudo access correctly disabled on {node_type} number {node_index}") + break + if attempt < retries - 1: + logging.info( + f"Sudo not yet disabled on {node_type} number {node_index}, " + f"retrying in {retry_delay}s (attempt {attempt + 1}/{retries})..." + ) + time.sleep(retry_delay) + else: + assert_that(result.stdout).contains("a password is required") + assert_that(result.failed).is_equal_to(True) + else: + assert_that(result.failed).is_equal_to(False) + break def assert_lambda_vpc_settings_are_correct(stack_name, region, security_group_ids, subnet_ids): From 4b0a3767c6abe94faf9cde98fcf9dfd7ddfd7553 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 14 May 2026 13:11:15 -0400 Subject: [PATCH 8/9] [Test] Adding the check for fleet status check only for slurm scheduler --- .../integration-tests/tests/tags/test_tag_propagation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index 2c224d7891..e8898f4cbf 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -54,10 +54,14 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os _check_tag_propagation(cluster, scheduler, os, volume_name) cluster.stop() - wait_for_computefleet_changed(cluster, "STOPPED") + if scheduler == "slurm": + wait_for_computefleet_changed(cluster, "STOPPED") # Updates cluster with new configuration updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name) - cluster.update(str(updated_cluster_config)) + force_update = None + if scheduler != "slurm": + force_update = "true" + cluster.update(str(updated_cluster_config), force_update=force_update) cluster.start() From 1b626a24e0e6d45dd5944a064c2970489d91ca49 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 14 May 2026 12:25:49 -0400 Subject: [PATCH 9/9] [Test] Removing upload_github_artifacts_to_s3 which was incorrectly cherry-picked --- tests/integration-tests/tests/createami/test_createami.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 57c6b8491a..d797b3dacb 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -43,7 +43,6 @@ get_installed_parallelcluster_base_version, get_installed_parallelcluster_version, retrieve_latest_ami, - upload_github_artifacts_to_s3, ) from tests.proxy.test_proxy import proxy_stack_factory # noqa: F401