diff --git a/CHANGELOG.md b/CHANGELOG.md index 624dade051..db12315fd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,7 @@ CHANGELOG - Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale. This new mechanism relies on shared storage to sync updates between the head node and compute nodes. - Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale. -- Support updates of `Tags` during cluster-updates. +- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions. - Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource. - This overrides the parallelcluster default using a shallow merge. - Add alarm on missing clustermgtd heartbeat. diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index 4ff16a58e9..cd7afd94ea 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -11,6 +11,7 @@ import re from enum import Enum +from pcluster.aws.common import get_region from pcluster.config.cluster_config import QueueUpdateStrategy from pcluster.config.update_policy_utils import SharedStorageChangeInfo from pcluster.constants import ( @@ -147,6 +148,12 @@ def is_slurm_queues_change(change): return any(path.startswith("SlurmQueues[") for path in change.path) +def _is_adc_region(): + """Return True if the currently configured AWS region belongs to an ADC partition.""" + region = get_region() or "" + return region.startswith("us-iso") + + def extract_type_and_name_from_path(path): # Example path = 'SlurmQueues[slurm-q-name]' # This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name' @@ -696,6 +703,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str: + ". If you need this change, please consider creating a new cluster instead of updating the existing one.", ) +# Update supported everywhere except ADC regions (us-iso*, us-isob*). +# +# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes +# both a Tags change and a resource whose change is only in Metadata does not update that +# resource. This breaks the head node update flow because cfn-hup on the head node polls +# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them, +# leaving the HeadNodeWaitCondition to time out. +# +# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions. +UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy( + name="SUPPORTED_UNLESS_ADC", + level=1000, + fail_reason=lambda change, patch: ( + f"Updating '{change.key}' during a cluster update is not supported in ADC regions." + ), + action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.", + condition_checker=lambda change, patch: not _is_adc_region(), +) + # Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy UpdatePolicy.MANAGED_FSX = UpdatePolicy( name="MANAGED_FSX", diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index 601535ec8b..35762f002e 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -1866,7 +1866,11 @@ class ClusterSchema(BaseSchema): monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"}) + tags = fields.Nested( + TagSchema, + many=True, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"}, + ) iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED}) directory_service = fields.Nested( DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP} diff --git a/cli/src/pcluster/schemas/common_schema.py b/cli/src/pcluster/schemas/common_schema.py index 32d6bf1146..da2030f244 100644 --- a/cli/src/pcluster/schemas/common_schema.py +++ b/cli/src/pcluster/schemas/common_schema.py @@ -183,7 +183,7 @@ class TagSchema(BaseSchema): value = fields.Str( required=True, validate=validate.Length(max=256), - metadata={"update_policy": UpdatePolicy.SUPPORTED}, + metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC}, ) @post_load diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 2a89c314cc..d38c97eba1 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_addition", ), pytest.param( @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", {"Key": "test2", "Value": "val2"}, None, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_removal", ), pytest.param( @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket( "Value", "old_value", "new_value", - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=False, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="tag_value_modification", ), pytest.param( @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket( "Tags", None, {"Key": "test3", "Value": "val3"}, - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, is_list=True, ) ], - UpdatePolicy.SUPPORTED, + UpdatePolicy.SUPPORTED_UNLESS_ADC, id="order_change_plus_addition", ), pytest.param( diff --git a/cli/tests/pcluster/config/test_update_policy.py b/cli/tests/pcluster/config/test_update_policy.py index b88d9d0668..9e47e3b357 100644 --- a/cli/tests/pcluster/config/test_update_policy.py +++ b/cli/tests/pcluster/config/test_update_policy.py @@ -3044,3 +3044,44 @@ def test_head_node_local_storage_update_blocked( if expected_action_needed: assert_that(action_needed_messages).is_not_empty() assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed) + + +@pytest.mark.parametrize( + "region, expected_result", + [ + pytest.param("us-east-1", True, id="commercial region allows tag update"), + pytest.param("eu-west-1", True, id="commercial EU region allows tag update"), + pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"), + pytest.param("cn-north-1", True, id="China region allows tag update"), + pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"), + pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"), + pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"), + pytest.param("", True, id="empty region falls back to allowing update"), + ], +) +def test_supported_unless_adc_condition_checker(mocker, region, expected_result): + """SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*).""" + mocker.patch("pcluster.config.update_policy.get_region", return_value=region) + + change_mock = mocker.MagicMock() + patch_mock = mocker.MagicMock() + + assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to( + expected_result + ) + + +def test_supported_unless_adc_fail_reason_mentions_adc(mocker): + """The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation.""" + mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1") + + change_mock = mocker.MagicMock() + change_mock.key = "Tags" + patch_mock = mocker.MagicMock() + + result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock) + + assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED) + assert_that(fail_reason).contains("ADC") + assert_that(fail_reason).contains("Tags") + assert_that(action_needed).contains("Tags") diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index e2bd946ae0..15a594bf62 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -599,13 +599,15 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} - proxy: - test_proxy.py::test_proxy: - dimensions: - - regions: {{ REGIONS }} - instances: {{ INSTANCES }} - oss: {{ OSS }} - schedulers: {{ SCHEDULERS }} +# This test has never passed in isolated regions because our proxy instance setup logic in the test needs to be adjusted +# If proxy instance is set up correctly, pcluster should be able to use proxy in isolated regions +# proxy: +# test_proxy.py::test_proxy: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} pyxis: test_pyxis.py::test_pyxis: dimensions: diff --git a/tests/integration-tests/framework/tests_configuration/config_renderer.py b/tests/integration-tests/framework/tests_configuration/config_renderer.py index 1e4aade100..f2fa7a52f0 100644 --- a/tests/integration-tests/framework/tests_configuration/config_renderer.py +++ b/tests/integration-tests/framework/tests_configuration/config_renderer.py @@ -32,6 +32,23 @@ ) +def _get_global_build_number(config=None, args=None): + """ + Gets the global build number from args or pytest config. + Returns the build number as an int, or 0 if not provided. + """ + global_build_number = 0 + if args: + args_dict = vars(args) + global_build_number = args_dict.get("global_build_number", 0) + elif config: + global_build_number = config.getoption("--global-build-number", default=0) + try: + return int(global_build_number) + except (TypeError, ValueError): + return 0 + + def _get_os_parameters(config=None, args=None): """ Gets OS jinja parameters. @@ -42,9 +59,17 @@ def _get_os_parameters(config=None, args=None): available_amis_oss_x86 = _get_available_amis_oss("x86", config=config, args=args) available_amis_oss_arm = _get_available_amis_oss("arm", config=config, args=args) result = {"AVAILABLE_AMIS_OSS_X86": available_amis_oss_x86, "AVAILABLE_AMIS_OSS_ARM": available_amis_oss_arm} - today_number = (date.today() - date(2020, 1, 1)).days - _propagate_os_jinja_variables("", result, today_number, SUPPORTED_OSES) + # Use global-build-number as the rotation seed if available and non-zero. + # This allows the OS rotation to advance on every build, enabling full coverage + # when running tests multiple times per day. Falls back to day-based rotation otherwise. + global_build_number = _get_global_build_number(config=config, args=args) + if global_build_number: + rotation_seed = global_build_number + else: + rotation_seed = (date.today() - date(2020, 1, 1)).days + + _propagate_os_jinja_variables("", result, rotation_seed, SUPPORTED_OSES) # DCV doesn't support AL2023. Therefore, the following logic makes sure the DCV jinja parameter is not AL2023 dcv_supported_oses = [ @@ -55,23 +80,23 @@ def _get_os_parameters(config=None, args=None): for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_DCV + UNSUPPORTED_ARM_OSES_FOR_DCV + UNSUPPORTED_OSES_FOR_NON_GPU_DCV ] - _propagate_os_jinja_variables("DCV_", result, today_number, dcv_supported_oses, dcv_supported_arm_oses) + _propagate_os_jinja_variables("DCV_", result, rotation_seed, dcv_supported_oses, dcv_supported_arm_oses) lustre_supported_oses = [os for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_LUSTRE] - _propagate_os_jinja_variables("LUSTRE_", result, today_number, lustre_supported_oses) + _propagate_os_jinja_variables("LUSTRE_", result, rotation_seed, lustre_supported_oses) no_rhel_oss = [os for os in SUPPORTED_OSES if "rhel" not in os] - _propagate_os_jinja_variables("NO_RHEL_", result, today_number, no_rhel_oss) + _propagate_os_jinja_variables("NO_RHEL_", result, rotation_seed, no_rhel_oss) no_rocky_oss = [os for os in SUPPORTED_OSES if "rocky" not in os] - _propagate_os_jinja_variables("NO_ROCKY_", result, today_number, no_rocky_oss) + _propagate_os_jinja_variables("NO_ROCKY_", result, rotation_seed, no_rocky_oss) rhel_oss = [os for os in SUPPORTED_OSES if "rhel" in os] - _propagate_os_jinja_variables("RHEL_", result, today_number, rhel_oss) + _propagate_os_jinja_variables("RHEL_", result, rotation_seed, rhel_oss) return result -def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_oses, supported_arm_oses=None): +def _propagate_os_jinja_variables(prefix, result, rotation_seed, supported_x86_oses, supported_arm_oses=None): available_amis_oss_x86 = result["AVAILABLE_AMIS_OSS_X86"] available_amis_oss_arm = result["AVAILABLE_AMIS_OSS_ARM"] if supported_arm_oses is None: @@ -83,8 +108,12 @@ def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_os result[f"{prefix}OS_X86"] = available_amis_oss_x86 result[f"{prefix}OS_ARM"] = available_amis_oss_arm for index in range(len(supported_x86_oses)): - result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[(today_number + index) % len(available_amis_oss_x86)] - result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[(today_number + index) % len(available_amis_oss_arm)] + result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[ + (rotation_seed + index) % len(available_amis_oss_x86) + ] + result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[ + (rotation_seed + index) % len(available_amis_oss_arm) + ] def _get_instance_type_parameters(): # noqa: C901 diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 2043986800..5cbd6bd743 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -338,6 +338,8 @@ def assert_default_user_has_desired_sudo_access( node_type, region, disable_sudo_access_default_user, + retries=6, + retry_delay=15, ): remote_command_executors = [] logging.info( @@ -354,12 +356,28 @@ def assert_default_user_has_desired_sudo_access( command = "sudo -n cat /etc/sudoers.d/90-cloud-init-users" for node_index, remote_command_executor in enumerate(remote_command_executors): - result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300) - logging.info(f"Default user in {node_type} number {node_index} and result.failed={result.failed}") - logging.info(f"Default user in {node_type} number {node_index} and result.stdout={result.stdout}") - if disable_sudo_access_default_user: - assert_that(result.stdout).contains("a password is required") - assert_that(result.failed).is_equal_to(disable_sudo_access_default_user) + for attempt in range(retries): + result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300) + logging.info( + f"Default user in {node_type} number {node_index} attempt {attempt} " + f"result.failed={result.failed} result.stdout={result.stdout}" + ) + if disable_sudo_access_default_user: + if "a password is required" in result.stdout and result.failed: + logging.info(f"Sudo access correctly disabled on {node_type} number {node_index}") + break + if attempt < retries - 1: + logging.info( + f"Sudo not yet disabled on {node_type} number {node_index}, " + f"retrying in {retry_delay}s (attempt {attempt + 1}/{retries})..." + ) + time.sleep(retry_delay) + else: + assert_that(result.stdout).contains("a password is required") + assert_that(result.failed).is_equal_to(True) + else: + assert_that(result.failed).is_equal_to(False) + break def assert_lambda_vpc_settings_are_correct(stack_name, region, security_group_ids, subnet_ids): diff --git a/tests/integration-tests/tests/common/nccl_common.py b/tests/integration-tests/tests/common/nccl_common.py index 355ab9cf08..f2d7a32bc6 100644 --- a/tests/integration-tests/tests/common/nccl_common.py +++ b/tests/integration-tests/tests/common/nccl_common.py @@ -17,7 +17,7 @@ from assertpy import assert_that from utils import get_instance_info -from tests.performance_tests.common import push_result_to_dynamodb +from tests.common.utils import push_result_to_dynamodb NCCL_COMMON_DATADIR = pathlib.Path(__file__).parent / "data/nccl/" diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index be8762aa54..877a6ac068 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -15,11 +15,14 @@ import random import string import time +import uuid from importlib.metadata import version as get_package_version import boto3 from assertpy import assert_that from botocore.exceptions import ClientError +from framework.framework_constants import METADATA_DEFAULT_REGION, PERFORMANCE_METADATA_TABLE +from framework.metadata_table_manager import MetadataTableManager from packaging import version as packaging_version from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor from retrying import retry @@ -644,6 +647,38 @@ def get_capacity_reservation_id(request, instance_type, region, count, os): return reservations_ids +def push_result_to_dynamodb(name, result, instance, os, mpi_variation=None, num_instances=None): + reporting_region = METADATA_DEFAULT_REGION + logging.info(f"Metadata reporting region {reporting_region}") + # Create the metadata table in case it doesn't exist + MetadataTableManager(reporting_region, PERFORMANCE_METADATA_TABLE).create_metadata_table() + try: + # Create DynamoDB resource + dynamodb = boto3.resource("dynamodb", region_name=reporting_region) + table = dynamodb.Table(PERFORMANCE_METADATA_TABLE) + + # Prepare item to be inserted + item = { + "id": str(uuid.uuid4().hex), + "name": name, + "instance": instance, + "os": os, + "timestamp": int(time.time()), + "result": str(result), + "pcluster_version": f"v{get_installed_parallelcluster_version()}", + "mpi_variation": str(mpi_variation), + "num_instances": num_instances, + } + + # Put item in the table + table.put_item(Item=item) + logging.info(f"Successfully pushed result to DynamoDB with id: {item['id']}") + + except Exception as e: + logging.error(f"Failed to push result to DynamoDB: {str(e)}") + raise + + @retry(stop_max_attempt_number=3, wait_fixed=seconds(10)) def _download_and_upload_to_s3(url, bucket_name, s3_key, s3_client): """Download a file from a URL and upload it to S3, with retries for transient network failures.""" diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 57c6b8491a..d797b3dacb 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -43,7 +43,6 @@ get_installed_parallelcluster_base_version, get_installed_parallelcluster_version, retrieve_latest_ami, - upload_github_artifacts_to_s3, ) from tests.proxy.test_proxy import proxy_stack_factory # noqa: F401 diff --git a/tests/integration-tests/tests/performance_tests/common.py b/tests/integration-tests/tests/performance_tests/common.py index 3441d0f940..ac7384875f 100644 --- a/tests/integration-tests/tests/performance_tests/common.py +++ b/tests/integration-tests/tests/performance_tests/common.py @@ -3,20 +3,16 @@ import os as os_lib import pathlib import shutil -import time -import uuid from math import ceil from os import makedirs import boto3 from assertpy import assert_that -from framework.framework_constants import METADATA_DEFAULT_REGION, PERFORMANCE_METADATA_TABLE -from framework.metadata_table_manager import MetadataTableManager from retrying import retry from time_utils import seconds from utils import get_username_for_os -from tests.common.utils import fetch_instance_slots, get_installed_parallelcluster_version +from tests.common.utils import fetch_instance_slots # Common settings used by all test scenarios # You can change these variables to adapt the performance test to your needs. @@ -253,35 +249,3 @@ def _log_output_performance_difference(node, performance_degradation, observed_v f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" ) - - -def push_result_to_dynamodb(name, result, instance, os, mpi_variation=None, num_instances=None): - reporting_region = METADATA_DEFAULT_REGION - logging.info(f"Metadata reporting region {reporting_region}") - # Create the metadata table in case it doesn't exist - MetadataTableManager(reporting_region, PERFORMANCE_METADATA_TABLE).create_metadata_table() - try: - # Create DynamoDB resource - dynamodb = boto3.resource("dynamodb", region_name=reporting_region) - table = dynamodb.Table(PERFORMANCE_METADATA_TABLE) - - # Prepare item to be inserted - item = { - "id": str(uuid.uuid4().hex), - "name": name, - "instance": instance, - "os": os, - "timestamp": int(time.time()), - "result": str(result), - "pcluster_version": f"v{get_installed_parallelcluster_version()}", - "mpi_variation": str(mpi_variation), - "num_instances": num_instances, - } - - # Put item in the table - table.put_item(Item=item) - logging.info(f"Successfully pushed result to DynamoDB with id: {item['id']}") - - except Exception as e: - logging.error(f"Failed to push result to DynamoDB: {str(e)}") - raise diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 811b585c28..0a7fa2ce3d 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -4,7 +4,8 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.performance_tests.common import _log_output_performance_difference, push_result_to_dynamodb +from tests.common.utils import push_result_to_dynamodb +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds OPENFOAM_INSTALLATION_TIMEOUT = 300 diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index b996831c10..ec7d802187 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -22,10 +22,10 @@ from tests.common.utils import ( fetch_instance_slots, get_capacity_reservation_id, + push_result_to_dynamodb, run_system_analyzer, write_file, ) -from tests.performance_tests.common import push_result_to_dynamodb # We collected OSU benchmarks results for these instance types OSU_BENCHMARKS_INSTANCES = ["c5n.18xlarge", "p5en.48xlarge", "p6-b200.48xlarge"] diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 77f8d97ed5..1935a3cf55 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -5,8 +5,13 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.common.utils import assert_no_file_handler_leak, fetch_instance_slots, get_compute_ip_to_num_files -from tests.performance_tests.common import _log_output_performance_difference, push_result_to_dynamodb +from tests.common.utils import ( + assert_no_file_handler_leak, + fetch_instance_slots, + get_compute_ip_to_num_files, + push_result_to_dynamodb, +) +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds STARCCM_INSTALLATION_TIMEOUT = 1800 diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index 290f8ec186..e8898f4cbf 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -26,6 +26,7 @@ get_shared_volume_tags, ) from time_utils import minutes, seconds +from utils import wait_for_computefleet_changed from tests.common.schedulers_common import SlurmCommands from tests.common.utils import get_installed_parallelcluster_version @@ -53,18 +54,31 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os _check_tag_propagation(cluster, scheduler, os, volume_name) cluster.stop() - + if scheduler == "slurm": + wait_for_computefleet_changed(cluster, "STOPPED") # Updates cluster with new configuration updated_cluster_config = pcluster_config_reader(config_file="pcluster.config.update.yaml", volume_name=volume_name) - cluster.update(str(updated_cluster_config), force_update="true") + force_update = None + if scheduler != "slurm": + force_update = "true" + cluster.update(str(updated_cluster_config), force_update=force_update) cluster.start() # Makes sure that the compute nodes have started before checking tags _wait_for_compute_fleet_start(cluster) - # Checks for tag propagation - _check_tag_propagation(cluster, scheduler, os, volume_name, add_additional_config_tags=True) + # Checks for tag propagation. + # In ADC regions, CloudFormation doesn't properly update stack tags during a cluster update (see the + # SUPPORTED_UNLESS_ADC update policy), so AdditionalConfigTag is intentionally + # not added to the updated configuration and we skip asserting it here. + _check_tag_propagation( + cluster, + scheduler, + os, + volume_name, + add_additional_config_tags="us-iso" not in cluster.region, + ) _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name) @@ -127,6 +141,7 @@ def _check_tag_propagation( {"Name": "HeadNode", "parallelcluster:node-type": "HeadNode"}, ), "tag_getter_kwargs": {"cluster": cluster, "os": os}, + "skip": "us-iso" in cluster.region, }, { "resource": "Compute Node", diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml index f26ba6dbb4..b74bf20f92 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml @@ -1,12 +1,25 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag Value: ConfigFileTagValue - Key: QueueOverrideTag Value: ClusterLevelValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml index e7d09418dd..e8458f36c2 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml @@ -1,6 +1,18 @@ Image: Os: {{ os }} Tags: +{% if "us-iso" in region %} + # In ADC regions a CloudFormation bug causes HeadNodeLaunchTemplate to not be + # updated when UpdateStack includes a Tags change together with a metadata-only + # change on the resource, so we avoid adding a tag here (see SUPPORTED_UNLESS_ADC + # update policy). Tags must match pcluster.config.yaml exactly to avoid a diff. + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +{% else %} - Key: ComputeOverrideTag Value: ClusterLevelValue - Key: ConfigFileTag @@ -9,6 +21,7 @@ Tags: Value: ClusterLevelValue - Key: AdditionalConfigTag Value: AdditionalConfigTagValue +{% endif %} HeadNode: InstanceType: {{ instance }} Networking: