From f4cc0f777b4a998aefb8ae341f4e8f5524f4fa1f Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Wed, 3 Jun 2026 11:59:52 +0000 Subject: [PATCH] fix(failover): keep allocation when VM missing from VM source but still on hypervisor When the VM source (postgres) is missing data but VMs are still active on hypervisors (e.g. after a postgres data loss), the failover controller previously treated every allocated VM as 'no longer exists' and wiped the allocations. Empty reservations were then deleted, causing total loss of failover coverage despite the VMs still running. Cross-check hv1.HypervisorList.Status.Instances before removing a VM from allocations in reconcileRemoveInvalidVMFromReservations. If the VM is still reported active on any known hypervisor, keep the allocation and log a warning. The hypervisor-mismatch branch is intentionally left unchanged: it is already governed by the TrustHypervisorLocation flag. Adds unit tests covering: - VM missing from VM source but still on hypervisor (allocation kept) - VM source completely empty while VMs still on hypervisors (all reservations preserved -- the exact incident scenario) - Mixed: postgres-missing-but-on-HV vs. truly gone (only truly gone removed) - Hypervisor mismatch is not suppressed by the safeguard --- .../reservations/failover/controller.go | 44 ++++++- .../reservations/failover/controller_test.go | 109 +++++++++++++++++- 2 files changed, 148 insertions(+), 5 deletions(-) diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go index c66295293..348ba0dca 100644 --- a/internal/scheduling/reservations/failover/controller.go +++ b/internal/scheduling/reservations/failover/controller.go @@ -268,6 +268,28 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( allHypervisors = append(allHypervisors, hv.Name) } + // Build a set of VM UUIDs currently active on any known hypervisor. + // This is used as a safeguard against removing failover allocations when the + // VM source (postgres) is missing data but the VM is still alive on a + // hypervisor (e.g. after a postgres data loss / restore). Without this + // cross-check a wiped postgres would cause every failover reservation to be + // emptied and then deleted. + vmsOnHypervisor := make(map[string]string) + for _, hv := range hypervisorList.Items { + for _, inst := range hv.Status.Instances { + if !inst.Active { + continue + } + if _, exists := vmsOnHypervisor[inst.ID]; exists { + // VM appears on multiple hypervisors (transient during live + // migration). Keep the first occurrence; the safeguard only + // needs to know it exists somewhere. + continue + } + vmsOnHypervisor[inst.ID] = hv.Name + } + } + // 2. Get all VMs that might need failover reservations vms, err := c.VMSource.ListVMsOnHypervisors(ctx, &hypervisorList, c.Config.TrustHypervisorLocation) if err != nil { @@ -289,7 +311,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( logger.V(1).Info("found failover reservations", "count", len(failoverReservations)) // 3. Remove VMs from reservations if they are no longer valid - failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, failoverReservations) + failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, vmsOnHypervisor, failoverReservations) for _, res := range reservationsToUpdate { if err := c.patchReservationStatus(ctx, res); err != nil { @@ -381,9 +403,17 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( // - The VM has moved to a different host // Returns the updated list of reservations (with modifications applied in-memory). // The caller is responsible for persisting any changes to the cluster. +// +// vmsOnHypervisor maps VM UUID -> hypervisor name for every VM currently active +// on any known hypervisor (sourced from hv1.HypervisorList.Status.Instances). +// It is used as a safeguard: if a VM is missing from the postgres-derived vms +// slice but is still present on a hypervisor, we keep the allocation. This +// protects failover reservations from being wiped by transient/total postgres +// data loss. func reconcileRemoveInvalidVMFromReservations( ctx context.Context, vms []VM, + vmsOnHypervisor map[string]string, failoverReservations []v1alpha1.Reservation, ) (updatedReservations []v1alpha1.Reservation, reservationsToUpdate []*v1alpha1.Reservation) { @@ -404,6 +434,18 @@ func reconcileRemoveInvalidVMFromReservations( for vmUUID, allocatedHypervisor := range allocations { vmCurrentHypervisor, vmExists := vmToHypervisor[vmUUID] if !vmExists { + // Safeguard: if the VM is missing from the VM source (e.g. + // postgres) but is still reported active on a hypervisor by + // the hypervisor operator CRD, keep the allocation. This + // prevents a postgres data loss from cascading into a mass + // deletion of failover reservations. + if hv, stillOnHV := vmsOnHypervisor[vmUUID]; stillOnHV { + logger.Info("keeping VM allocation despite missing from VM source: still active on hypervisor", + "vmUUID", vmUUID, "reservation", res.Name, + "allocatedHypervisor", allocatedHypervisor, "hypervisor", hv) + updatedAllocations[vmUUID] = allocatedHypervisor + continue + } logger.Info("removing VM from reservation allocations because VM no longer exists", "vmUUID", vmUUID, "reservation", res.Name) needsUpdate = true diff --git a/internal/scheduling/reservations/failover/controller_test.go b/internal/scheduling/reservations/failover/controller_test.go index 78d3168c7..aa91c42ce 100644 --- a/internal/scheduling/reservations/failover/controller_test.go +++ b/internal/scheduling/reservations/failover/controller_test.go @@ -577,6 +577,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { tests := []struct { name string vms []VM + vmsOnHypervisor map[string]string reservations []v1alpha1.Reservation expectedUpdatedCount int // number of reservations in updatedReservations expectedToUpdateCount int // number of reservations that need cluster update @@ -604,7 +605,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { name: "VM no longer exists - remove from allocations", vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), - // vm-2 no longer exists + // vm-2 no longer exists (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -641,7 +642,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), newTestVM("vm-2", "host2", "flavor1"), - // vm-3 no longer exists + // vm-3 no longer exists (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -662,7 +663,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { { name: "all VMs removed from reservation - empty allocations", vms: []VM{ - // no VMs exist + // no VMs exist (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -705,7 +706,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), // valid newTestVM("vm-2", "host5", "flavor1"), // moved from host2 to host5 - // vm-3 deleted + // vm-3 deleted (and not on any hypervisor) newTestVM("vm-4", "host4", "flavor1"), // valid }, reservations: []v1alpha1.Reservation{ @@ -725,6 +726,105 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { "res-2": {"vm-4": "host4"}, }, }, + // ==================================================================== + // Safeguard: VM missing from VM source but still on a hypervisor + // (e.g. postgres data loss). Allocations must be preserved. + // ==================================================================== + { + name: "safeguard: VM missing from VM source but still on hypervisor - keep allocation", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + // vm-2 missing from VM source (postgres lost data) + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // still alive on hypervisor + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host3", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 0, // safeguard prevents the update + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + }, + }, + { + name: "safeguard: VM source completely empty but VMs still on hypervisors - keep all allocations", + vms: []VM{ + // VM source returns nothing (postgres wiped) + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + "vm-3": "host3", + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host4", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + }), + newTestReservation("res-2", "host5", map[string]string{ + "vm-3": "host3", + }), + }, + expectedUpdatedCount: 2, + expectedToUpdateCount: 0, // safeguard prevents both updates + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + "res-2": {"vm-3": "host3"}, + }, + }, + { + name: "safeguard: only some missing VMs are still on hypervisor - remove only fully gone ones", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + // vm-2 missing from postgres but on hypervisor + // vm-3 missing from both + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + // vm-3 not on any hypervisor + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host4", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // safeguarded + "vm-3": "host3", // truly gone - remove + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 1, + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + }, + }, + { + name: "safeguard does not protect against hypervisor mismatch (VM is in postgres on different host)", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + newTestVM("vm-2", "host5", "flavor1"), // moved to host5 in postgres + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host5", + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host3", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // host mismatch - should still be removed + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 1, + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1"}, + }, + }, } for _, tt := range tests { @@ -733,6 +833,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { updatedReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations( ctx, tt.vms, + tt.vmsOnHypervisor, tt.reservations, )