From 3790ad02933cf4fa8a7c7aa9862e08e33248a106 Mon Sep 17 00:00:00 2001 From: "dd-octo-sts[bot]" <200755185+dd-octo-sts[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 17:37:04 +0000 Subject: [PATCH] [autoscaling] Re-sync local-owner DPAs on annotation changes (#51173) ### What does this PR do? MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a bug in the workload autoscaling controller where annotation-only edits on a locally-owned `DatadogPodAutoscaler` were silently ignored until the next `.spec` change or cluster-agent restart. The Local-owner branch in `syncPodAutoscaler` previously gated `UpdateFromPodAutoscaler` on `.metadata.generation` changes. Annotation edits don't bump `.metadata.generation`, so adding/changing `autoscaling.datadoghq.com/preview` (or any other annotation the controller reads) did nothing — `IsBurstable()` and the parsed `previewOptions` stayed stale, and the burstable CPU-limit removal never took effect. The gate is replaced by `NeedsResyncFromPodAutoscaler`, which compares: - `.metadata.generation` - the profile label (`autoscaling.datadoghq.com/profile`) - every annotation consumed by `UpdateFromPodAutoscaler` (`preview`, `profile-template-hash`, `custom-recommender`) The list of relevant annotation keys (`resyncMetadataKeysFromPodAutoscaler`) lives next to `UpdateFromPodAutoscaler` with an explicit comment to keep the two in sync. ### Motivation Customer report: a `DatadogPodAutoscaler` with `owner: Local` had the burstable preview annotation added post-creation, but the workload's CPU limits were never stripped. Root cause traced to the generation-only gate at `controller.go` Local-owner branch — the parsing in `UpdateFromPodAutoscaler` was never re-invoked after the annotation appeared. ### Describe how you validated your changes **Unit tests** (`pkg/clusteragent/autoscaling/workload/model/`): - New `TestNeedsResyncFromPodAutoscaler` with 10 sub-tests covering: no cached upstream, identical object (no resync), generation bump, preview annotation added/changed/removed, profile label change, profile-template-hash change, custom-recommender change, and an irrelevant annotation that must NOT trigger a resync. **Full suite** (`./pkg/clusteragent/autoscaling/...`): all 14 packages green. **End-to-end on a kind cluster** with a cluster-agent built from this branch: - Created a Local DPA with no preview annotation, observed initial state. - Ran `kubectl annotate dpa ... autoscaling.datadoghq.com/preview='{\"burstable\":true}'` — the controller logged a re-sync at generation 1, no spec change. - Ran `kubectl annotate dpa ... autoscaling.datadoghq.com/preview-` to remove it — the controller logged a re-sync again at generation 1. - Confirmed `previewOptions.Burstable` flipped on both transitions. Before this change, the same scenario produced no re-sync log and `IsBurstable()` stayed false. ### Additional Notes - Workaround for customers on an affected agent: `kubectl -n datadog rollout restart deploy/datadog-cluster-agent`, or any `.spec` edit to force a generation bump. - The `agent autoscaler-list` CLI prints `Burstable: false` for all DPAs regardless of state because `previewOptions` is unexported and stripped during IPC serialization; that is a separate issue tracked outside this PR. 🤖 Assisted by Claude:claude-opus-4-7 Co-authored-by: cedric.lamoriniere (cherry picked from commit 8dde25f8a57b5fb9d323899139ae84fbb099df4f) ___ Co-authored-by: Cedric Lamoriniere --- .../autoscaling/workload/controller.go | 8 ++-- .../workload/model/pod_autoscaler.go | 44 ++++++++++++++++++- .../workload/model/pod_autoscaler_test.go | 28 ++++++++++++ .../model/pod_autoscaler_test_utils.go | 10 +++++ ...pa-annotation-resync-40deef59a1cde0aa.yaml | 9 ++++ 5 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/fix-dpa-annotation-resync-40deef59a1cde0aa.yaml diff --git a/pkg/clusteragent/autoscaling/workload/controller.go b/pkg/clusteragent/autoscaling/workload/controller.go index 7a7eb83c541f..47a115866259 100644 --- a/pkg/clusteragent/autoscaling/workload/controller.go +++ b/pkg/clusteragent/autoscaling/workload/controller.go @@ -393,10 +393,10 @@ func (c *Controller) syncPodAutoscaler(ctx context.Context, key, ns, name string // Fall through to normal scaling logic. } else if podAutoscalerInternal.Spec().Owner == datadoghqcommon.DatadogPodAutoscalerLocalOwner { - // Implement sync logic for local ownership, source of truth is Kubernetes - if podAutoscalerInternal.Generation() != podAutoscaler.Generation { - podAutoscalerInternal.UpdateFromPodAutoscaler(podAutoscaler) - } + // Sync logic for local ownership: Kubernetes is the source of truth. + // UpdateFromPodAutoscaler internally short-circuits when neither .metadata.generation + // nor any of the watched labels/annotations has changed. + podAutoscalerInternal.UpdateFromPodAutoscaler(podAutoscaler) } // Reaching this point, we had no errors in processing, clearing up global error diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index cafa3a5fbd47..fa92667ecdc7 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -20,6 +20,7 @@ import ( "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling" "github.com/DataDog/datadog-agent/pkg/util/pointer" + "github.com/twmb/murmur3" autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -45,6 +46,18 @@ const ( CustomRecommenderAnnotationKey = "autoscaling.datadoghq.com/custom-recommender" ) +// resyncLabelKeysFromPodAutoscaler lists the label keys read by UpdateFromPodAutoscaler. +var resyncLabelKeysFromPodAutoscaler = []string{ + ProfileLabelKey, +} + +// resyncAnnotationKeysFromPodAutoscaler lists the annotation keys read by UpdateFromPodAutoscaler. +var resyncAnnotationKeysFromPodAutoscaler = []string{ + PreviewAnnotationKey, + ProfileTemplateHashAnnotation, + CustomRecommenderAnnotationKey, +} + // PodAutoscalerInternal holds the necessary data to work with the `DatadogPodAutoscaler` CRD. type PodAutoscalerInternal struct { // namespace is the namespace of the PodAutoscaler @@ -83,6 +96,11 @@ type PodAutoscalerInternal struct { // previewOptions holds the parsed preview feature flags from the DPA annotations previewOptions previewOptions + // metadataHash fingerprints the K8s state read by UpdateFromPodAutoscaler + // (.metadata.generation plus watched labels/annotations), so that a single + // equality check captures both spec changes and out-of-spec edits. + metadataHash uint64 + // scalingValues represents the active scaling values that should be used scalingValues ScalingValues @@ -293,8 +311,17 @@ func (p *PodAutoscalerInternal) UpdateFromProfile( p.horizontalEventsRetention, p.horizontalRecommendationsRetention = getHorizontalRetentionValues(dpaSpec.ApplyPolicy) } -// UpdateFromPodAutoscaler updates the PodAutoscalerInternal from a PodAutoscaler object inside K8S +// UpdateFromPodAutoscaler updates the PodAutoscalerInternal from a PodAutoscaler object inside K8S. +// For local-owner DPAs it short-circuits when the cached metadata fingerprint is unchanged. func (p *PodAutoscalerInternal) UpdateFromPodAutoscaler(podAutoscaler *datadoghq.DatadogPodAutoscaler) { + if podAutoscaler.Spec.Owner == datadoghqcommon.DatadogPodAutoscalerLocalOwner { + newHash := computePodAutoscalerMetadataHash(podAutoscaler) + if p.metadataHash == newHash { + return + } + p.metadataHash = newHash + } + if v, ok := podAutoscaler.Labels[ProfileLabelKey]; ok { p.profileName = v } @@ -1340,3 +1367,18 @@ func parseCustomConfigurationAnnotation(annotations map[string]string) (*Recomme return &customConfiguration, nil } + +// computePodAutoscalerMetadataHash returns a fingerprint of the K8s state +// read by UpdateFromPodAutoscaler — .metadata.generation plus the watched +// labels/annotations — used to identify any change worth re-syncing. +func computePodAutoscalerMetadataHash(podAutoscaler *datadoghq.DatadogPodAutoscaler) uint64 { + hasher := murmur3.New64() + _, _ = fmt.Fprintf(hasher, "G\x00%d\x00", podAutoscaler.Generation) + for _, key := range resyncLabelKeysFromPodAutoscaler { + _, _ = hasher.Write([]byte("L\x00" + key + "\x00" + podAutoscaler.Labels[key] + "\x00")) + } + for _, key := range resyncAnnotationKeysFromPodAutoscaler { + _, _ = hasher.Write([]byte("A\x00" + key + "\x00" + podAutoscaler.Annotations[key] + "\x00")) + } + return hasher.Sum64() +} diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index 3785c44aa6a6..f8b05c7b27db 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -847,3 +847,31 @@ func TestContainerResourcesForStatus(t *testing.T) { }) } } + +// TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata verifies that, for local-owner DPAs, +// UpdateFromPodAutoscaler picks up changes to the watched labels/annotations even when +// .metadata.generation is unchanged (e.g. an annotation-only edit to PreviewAnnotationKey). +func TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata(t *testing.T) { + dpa := &datadoghq.DatadogPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "dpa", Namespace: "default", Generation: 1}, + Spec: datadoghq.DatadogPodAutoscalerSpec{Owner: datadoghqcommon.DatadogPodAutoscalerLocalOwner}, + } + + pai := NewPodAutoscalerInternal(dpa) + assert.False(t, pai.IsBurstable()) + + // Annotation-only edit: same generation, new preview annotation. + dpa.Annotations = map[string]string{PreviewAnnotationKey: `{"burstable":true}`} + pai.UpdateFromPodAutoscaler(dpa) + assert.True(t, pai.IsBurstable(), "annotation-only edit must be picked up") + + // Calling again with the same object is a no-op (gate kicks in). + previousHash := pai.metadataHash + pai.UpdateFromPodAutoscaler(dpa) + assert.Equal(t, previousHash, pai.metadataHash) + + // An unrelated annotation change does not retrigger the parse but is harmless. + dpa.Annotations["unrelated"] = "x" + pai.UpdateFromPodAutoscaler(dpa) + assert.True(t, pai.IsBurstable()) +} diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go index c5f6a24ca9ab..31616bcedcfa 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go @@ -103,11 +103,21 @@ func (f FakePodAutoscalerInternal) Build() PodAutoscalerInternal { upstreamCR.Annotations[PreviewAnnotationKey] = f.PreviewAnnotationKey } + // Mirror UpdateFromPodAutoscaler: cache the metadata fingerprint so equality checks + // against a PodAutoscalerInternal that came through the real code path match. + // Only populate when the test passes an explicit local-owner UpstreamCR — settings-driven + // shells (and non-local owners) do not go through the gated path in production. + var metadataHash uint64 + if f.UpstreamCR != nil && f.UpstreamCR.Spec.Owner == datadoghqcommon.DatadogPodAutoscalerLocalOwner { + metadataHash = computePodAutoscalerMetadataHash(upstreamCR) + } + return PodAutoscalerInternal{ namespace: f.Namespace, name: f.Name, generation: f.Generation, upstreamCR: upstreamCR, + metadataHash: metadataHash, settingsTimestamp: f.SettingsTimestamp, creationTimestamp: f.CreationTimestamp, scalingValues: f.ScalingValues, diff --git a/releasenotes/notes/fix-dpa-annotation-resync-40deef59a1cde0aa.yaml b/releasenotes/notes/fix-dpa-annotation-resync-40deef59a1cde0aa.yaml new file mode 100644 index 000000000000..a1ec8797da2b --- /dev/null +++ b/releasenotes/notes/fix-dpa-annotation-resync-40deef59a1cde0aa.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fix a bug in the workload autoscaling controller where annotation-only edits + (e.g. ``autoscaling.datadoghq.com/preview``) on a locally-owned + ``DatadogPodAutoscaler`` were not picked up until the next ``.spec`` change or + cluster-agent restart, because the controller gated re-sync on + ``.metadata.generation`` (which annotations do not bump). Toggling burstable + mode via the preview annotation now takes effect on the next reconcile.