diff --git a/internal/scheduling/reservations/failover/controller.go b/internal/scheduling/reservations/failover/controller.go index c66295293..348ba0dca 100644 --- a/internal/scheduling/reservations/failover/controller.go +++ b/internal/scheduling/reservations/failover/controller.go @@ -268,6 +268,28 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( allHypervisors = append(allHypervisors, hv.Name) } + // Build a set of VM UUIDs currently active on any known hypervisor. + // This is used as a safeguard against removing failover allocations when the + // VM source (postgres) is missing data but the VM is still alive on a + // hypervisor (e.g. after a postgres data loss / restore). Without this + // cross-check a wiped postgres would cause every failover reservation to be + // emptied and then deleted. + vmsOnHypervisor := make(map[string]string) + for _, hv := range hypervisorList.Items { + for _, inst := range hv.Status.Instances { + if !inst.Active { + continue + } + if _, exists := vmsOnHypervisor[inst.ID]; exists { + // VM appears on multiple hypervisors (transient during live + // migration). Keep the first occurrence; the safeguard only + // needs to know it exists somewhere. + continue + } + vmsOnHypervisor[inst.ID] = hv.Name + } + } + // 2. Get all VMs that might need failover reservations vms, err := c.VMSource.ListVMsOnHypervisors(ctx, &hypervisorList, c.Config.TrustHypervisorLocation) if err != nil { @@ -289,7 +311,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( logger.V(1).Info("found failover reservations", "count", len(failoverReservations)) // 3. Remove VMs from reservations if they are no longer valid - failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, failoverReservations) + failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, vmsOnHypervisor, failoverReservations) for _, res := range reservationsToUpdate { if err := c.patchReservationStatus(ctx, res); err != nil { @@ -381,9 +403,17 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) ( // - The VM has moved to a different host // Returns the updated list of reservations (with modifications applied in-memory). // The caller is responsible for persisting any changes to the cluster. +// +// vmsOnHypervisor maps VM UUID -> hypervisor name for every VM currently active +// on any known hypervisor (sourced from hv1.HypervisorList.Status.Instances). +// It is used as a safeguard: if a VM is missing from the postgres-derived vms +// slice but is still present on a hypervisor, we keep the allocation. This +// protects failover reservations from being wiped by transient/total postgres +// data loss. func reconcileRemoveInvalidVMFromReservations( ctx context.Context, vms []VM, + vmsOnHypervisor map[string]string, failoverReservations []v1alpha1.Reservation, ) (updatedReservations []v1alpha1.Reservation, reservationsToUpdate []*v1alpha1.Reservation) { @@ -404,6 +434,18 @@ func reconcileRemoveInvalidVMFromReservations( for vmUUID, allocatedHypervisor := range allocations { vmCurrentHypervisor, vmExists := vmToHypervisor[vmUUID] if !vmExists { + // Safeguard: if the VM is missing from the VM source (e.g. + // postgres) but is still reported active on a hypervisor by + // the hypervisor operator CRD, keep the allocation. This + // prevents a postgres data loss from cascading into a mass + // deletion of failover reservations. + if hv, stillOnHV := vmsOnHypervisor[vmUUID]; stillOnHV { + logger.Info("keeping VM allocation despite missing from VM source: still active on hypervisor", + "vmUUID", vmUUID, "reservation", res.Name, + "allocatedHypervisor", allocatedHypervisor, "hypervisor", hv) + updatedAllocations[vmUUID] = allocatedHypervisor + continue + } logger.Info("removing VM from reservation allocations because VM no longer exists", "vmUUID", vmUUID, "reservation", res.Name) needsUpdate = true diff --git a/internal/scheduling/reservations/failover/controller_test.go b/internal/scheduling/reservations/failover/controller_test.go index 78d3168c7..aa91c42ce 100644 --- a/internal/scheduling/reservations/failover/controller_test.go +++ b/internal/scheduling/reservations/failover/controller_test.go @@ -577,6 +577,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { tests := []struct { name string vms []VM + vmsOnHypervisor map[string]string reservations []v1alpha1.Reservation expectedUpdatedCount int // number of reservations in updatedReservations expectedToUpdateCount int // number of reservations that need cluster update @@ -604,7 +605,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { name: "VM no longer exists - remove from allocations", vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), - // vm-2 no longer exists + // vm-2 no longer exists (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -641,7 +642,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), newTestVM("vm-2", "host2", "flavor1"), - // vm-3 no longer exists + // vm-3 no longer exists (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -662,7 +663,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { { name: "all VMs removed from reservation - empty allocations", vms: []VM{ - // no VMs exist + // no VMs exist (and not on any hypervisor) }, reservations: []v1alpha1.Reservation{ newTestReservation("res-1", "host3", map[string]string{ @@ -705,7 +706,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { vms: []VM{ newTestVM("vm-1", "host1", "flavor1"), // valid newTestVM("vm-2", "host5", "flavor1"), // moved from host2 to host5 - // vm-3 deleted + // vm-3 deleted (and not on any hypervisor) newTestVM("vm-4", "host4", "flavor1"), // valid }, reservations: []v1alpha1.Reservation{ @@ -725,6 +726,105 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { "res-2": {"vm-4": "host4"}, }, }, + // ==================================================================== + // Safeguard: VM missing from VM source but still on a hypervisor + // (e.g. postgres data loss). Allocations must be preserved. + // ==================================================================== + { + name: "safeguard: VM missing from VM source but still on hypervisor - keep allocation", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + // vm-2 missing from VM source (postgres lost data) + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // still alive on hypervisor + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host3", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 0, // safeguard prevents the update + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + }, + }, + { + name: "safeguard: VM source completely empty but VMs still on hypervisors - keep all allocations", + vms: []VM{ + // VM source returns nothing (postgres wiped) + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + "vm-3": "host3", + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host4", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + }), + newTestReservation("res-2", "host5", map[string]string{ + "vm-3": "host3", + }), + }, + expectedUpdatedCount: 2, + expectedToUpdateCount: 0, // safeguard prevents both updates + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + "res-2": {"vm-3": "host3"}, + }, + }, + { + name: "safeguard: only some missing VMs are still on hypervisor - remove only fully gone ones", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + // vm-2 missing from postgres but on hypervisor + // vm-3 missing from both + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host2", + // vm-3 not on any hypervisor + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host4", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // safeguarded + "vm-3": "host3", // truly gone - remove + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 1, + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1", "vm-2": "host2"}, + }, + }, + { + name: "safeguard does not protect against hypervisor mismatch (VM is in postgres on different host)", + vms: []VM{ + newTestVM("vm-1", "host1", "flavor1"), + newTestVM("vm-2", "host5", "flavor1"), // moved to host5 in postgres + }, + vmsOnHypervisor: map[string]string{ + "vm-1": "host1", + "vm-2": "host5", + }, + reservations: []v1alpha1.Reservation{ + newTestReservation("res-1", "host3", map[string]string{ + "vm-1": "host1", + "vm-2": "host2", // host mismatch - should still be removed + }), + }, + expectedUpdatedCount: 1, + expectedToUpdateCount: 1, + expectedAllocationsPerRes: map[string]map[string]string{ + "res-1": {"vm-1": "host1"}, + }, + }, } for _, tt := range tests { @@ -733,6 +833,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) { updatedReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations( ctx, tt.vms, + tt.vmsOnHypervisor, tt.reservations, )