From 15aeaa0367990103cdf9b4280de77f17f264b77e Mon Sep 17 00:00:00 2001 From: Andrew Karpow Date: Wed, 20 May 2026 11:36:30 -0400 Subject: [PATCH] Fix eviction overwriting failed migration condition with success When an eviction processes a list of VMs and one is in ERROR state, the controller correctly skips it via moveToBack and sets a Failed migration condition. However, the next successful migration of another VM unconditionally overwrote that condition with Succeeded, hiding the fact that an errored VM was still outstanding. Preserve the Failed condition while there are still outstanding VMs. The condition resolves naturally: either the errored VM is fixed and migrates (overwriting with Succeeded), or the eviction terminates with Evicting=Succeeded once the list empties. Adds a unit test covering a mixed list of healthy and errored VMs that exercises the skip-and-retry pattern across multiple reconciliations. --- internal/controller/eviction_controller.go | 24 ++- .../controller/eviction_controller_test.go | 160 ++++++++++++++++++ 2 files changed, 178 insertions(+), 6 deletions(-) diff --git a/internal/controller/eviction_controller.go b/internal/controller/eviction_controller.go index ae502fa1..4a766292 100644 --- a/internal/controller/eviction_controller.go +++ b/internal/controller/eviction_controller.go @@ -326,12 +326,24 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic if currentHypervisor != eviction.Spec.Hypervisor { log.Info("migrated") - meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ - Type: kvmv1.ConditionTypeMigration, - Status: metav1.ConditionFalse, - Message: fmt.Sprintf("Migration of instance %s finished", vm.ID), - Reason: kvmv1.ConditionReasonSucceeded, - }) + // Don't overwrite a sticky Failed migration condition with Succeeded + // while there are still other outstanding VMs - an earlier ERROR VM + // has been moved to the back of the queue and the eviction is not + // actually clean. The condition is reset only when the whole + // eviction completes (OutstandingInstances becomes empty). + remaining, _ := popInstance(eviction.Status.OutstandingInstances) + prior := meta.FindStatusCondition(eviction.Status.Conditions, kvmv1.ConditionTypeMigration) + stickyFailure := len(remaining) > 0 && prior != nil && + prior.Status == metav1.ConditionFalse && + prior.Reason == kvmv1.ConditionReasonFailed + if !stickyFailure { + meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeMigration, + Status: metav1.ConditionFalse, + Message: fmt.Sprintf("Migration of instance %s finished", vm.ID), + Reason: kvmv1.ConditionReasonSucceeded, + }) + } // So, it is already off this one, do we need to verify it? if vm.Status == "VERIFY_RESIZE" { diff --git a/internal/controller/eviction_controller_test.go b/internal/controller/eviction_controller_test.go index 85606ee5..5ef315b2 100644 --- a/internal/controller/eviction_controller_test.go +++ b/internal/controller/eviction_controller_test.go @@ -439,5 +439,165 @@ var _ = Describe("Eviction Controller", func() { }) }) }) + + Context("Mixed VM Eviction", func() { + // serverTpl renders a single server response. The eviction controller + // reads OS-EXT-SRV-ATTR:hypervisor_hostname (compared against the + // short-form hypervisor name from spec) plus status/task_state/power_state. + const serverTpl = `{ + "server": { + "id": "%[1]s", + "status": "%[2]s", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "%[3]s", + "OS-EXT-STS:task_state": "%[4]s", + "OS-EXT-STS:power_state": %[5]d, + "fault": {"code": 500, "message": "%[6]s"} + } +}` + + // migratedVMs is updated as the test simulates successful migrations. + // When a VM has been "migrated", its hypervisor_hostname response + // changes to a different host, signalling the controller it has left. + var migratedVMs map[string]bool + var liveMigrateCalls map[string]int + + BeforeEach(func(ctx SpecContext) { + migratedVMs = map[string]bool{} + liveMigrateCalls = map[string]int{} + + By("Seeding the eviction status with a list of VMs to evict") + // OutstandingInstances is processed from the END (peekInstance returns + // last). With [good-1, error-1, good-2], processing order is: + // 1) good-2 (last) - migrate, then drop + // 2) error-1 (now last) - skipped via moveToBack + // 3) good-1 - migrate, then drop + // 4) error-1 (alone) - keeps erroring + eviction := &kvmv1.Eviction{} + Expect(k8sClient.Get(ctx, typeNamespacedName, eviction)).To(Succeed()) + meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeEvicting, + Status: metav1.ConditionTrue, + Message: "Running", + Reason: kvmv1.ConditionReasonRunning, + }) + meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypePreflight, + Status: metav1.ConditionTrue, + Message: "preflight passed", + Reason: kvmv1.ConditionReasonSucceeded, + }) + eviction.Status.HypervisorServiceId = serviceId + eviction.Status.OutstandingInstances = []string{"good-1", "error-1", "good-2"} + eviction.Status.OutstandingRamMb = 4096 + Expect(k8sClient.Status().Update(ctx, eviction)).To(Succeed()) + + By("Mocking GET /servers/{id} to return per-VM state") + fakeServer.Mux.HandleFunc("GET /servers/{server_id}", func(w http.ResponseWriter, r *http.Request) { + serverID := r.PathValue("server_id") + w.Header().Add("Content-Type", "application/json") + + // hypervisor_hostname uses the FQDN-style name; the controller + // only compares the short prefix (before the first ".") against + // eviction.Spec.Hypervisor. After we mark a VM as migrated, + // pretend it lives on a different host so the controller treats + // it as "already moved" and removes it from the list. + hvHost := hypervisorName + ".example.local" + if migratedVMs[serverID] { + hvHost = "other-host.example.local" + } + + switch serverID { + case "good-1", "good-2": + status := "ACTIVE" + if migratedVMs[serverID] { + // Once migrated, status doesn't really matter, but + // keep it ACTIVE so we exercise the "different host" + // branch rather than VERIFY_RESIZE. + status = "ACTIVE" + } + w.WriteHeader(http.StatusOK) + _, err := fmt.Fprintf(w, serverTpl, serverID, status, hvHost, "", 1, "") + Expect(err).NotTo(HaveOccurred()) + case "error-1": + w.WriteHeader(http.StatusOK) + _, err := fmt.Fprintf(w, serverTpl, serverID, "ERROR", hvHost, "", 0, + "manual intervention required") + Expect(err).NotTo(HaveOccurred()) + default: + Fail("unexpected server id: " + serverID) + } + }) + + By("Mocking POST /servers/{id}/action for live-migration") + fakeServer.Mux.HandleFunc("POST /servers/{server_id}/action", func(w http.ResponseWriter, r *http.Request) { + serverID := r.PathValue("server_id") + liveMigrateCalls[serverID]++ + // Mark this VM as migrated so the next GET reports a different host. + migratedVMs[serverID] = true + w.WriteHeader(http.StatusAccepted) + }) + }) + + It("skips errored VMs, evicts healthy ones, and retries errored VMs in subsequent loops", func(ctx SpecContext) { + resource := &kvmv1.Eviction{} + + // Reconcile loop until the list is empty or we've gone too long. + // We expect: good-2 migrated, error-1 skipped, good-1 migrated, then + // only error-1 remains and keeps erroring. + By("Running reconciliations until only the errored VM remains") + const maxLoops = 20 + for i := range maxLoops { + // Tolerate errors here: when the controller hits the ERROR + // VM it returns an error (joined with the status update). + // That is part of the pattern under test, not a test failure. + _, reconcileErr := evictionReconciler.Reconcile(ctx, reconcileRequest) + _ = reconcileErr // expected on ERROR-VM iterations + Expect(k8sClient.Get(ctx, typeNamespacedName, resource)).To(Succeed()) + + // Once both healthy VMs have been migrated and removed, we are + // in the steady "only errored VM left" state. + remaining := resource.Status.OutstandingInstances + if len(remaining) == 1 && remaining[0] == "error-1" { + By(fmt.Sprintf("Reached steady state after %d reconciliations", i+1)) + break + } + } + + By("Both healthy VMs were live-migrated exactly once") + Expect(liveMigrateCalls["good-1"]).To(Equal(1), + "good-1 should have been migrated once") + Expect(liveMigrateCalls["good-2"]).To(Equal(1), + "good-2 should have been migrated once") + + By("The errored VM is still outstanding and never received a migrate call") + Expect(resource.Status.OutstandingInstances).To(Equal([]string{"error-1"})) + Expect(liveMigrateCalls).NotTo(HaveKey("error-1")) + + By("The migration condition reflects the most recent failure") + Expect(resource.Status.Conditions).To(ContainElement(SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeMigration), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", kvmv1.ConditionReasonFailed), + HaveField("Message", ContainSubstring("error-1")), + ))) + + By("The eviction is NOT marked successful while the errored VM remains") + Expect(resource.Status.Conditions).NotTo(ContainElement(SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeEvicting), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", kvmv1.ConditionReasonSucceeded), + ))) + + By("Subsequent reconciliations keep retrying the errored VM (and surfacing the error)") + _, err := evictionReconciler.Reconcile(ctx, reconcileRequest) + // The controller returns an error when it encounters a VM in ERROR + // state. The reconcile error should mention the errored UUID. + if err != nil { + Expect(err.Error()).To(ContainSubstring("error-1")) + } + Expect(k8sClient.Get(ctx, typeNamespacedName, resource)).To(Succeed()) + Expect(resource.Status.OutstandingInstances).To(Equal([]string{"error-1"})) + }) + }) }) })