From 30a54d0ecc4e155ff89052e72e86c25960b6e879 Mon Sep 17 00:00:00 2001 From: James Peru Date: Tue, 17 Mar 2026 06:54:24 +0300 Subject: [PATCH] NAS backup: resume paused VM on backup failure and fix missing exit When a NAS backup job fails (e.g. due to backup storage being full or I/O errors), the VM may remain indefinitely paused, and the script may fail to stop after the error, because: 1. The cleanup() function never checks whether the VM was left paused by virsh backup-begin during the push backup operation, so it never resumes it. 2. The 'Failed' case in the backup job monitoring loop calls cleanup() but lacks an 'exit' statement, causing an infinite loop where the script repeatedly detects the failed job and calls cleanup(). 3. Similarly, backup_stopped_vm() calls cleanup() on qemu-img convert failure but does not exit, allowing the loop to continue with subsequent disks despite the failure. This fix: - Adds VM state detection and resume to cleanup(), so that a VM found in a paused state during error handling is resumed (a failed resume is reported and reflected in cleanup()'s status) - Adds missing 'exit 1' after cleanup() in the Failed backup job case to prevent the infinite monitoring loop - Adds missing 'exit 1' after cleanup() in backup_stopped_vm() on qemu-img convert failure Co-Authored-By: Claude Opus 4.6 --- scripts/vm/hypervisor/kvm/nasbackup.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh index 7f4a4b621929..de0955239791 100755 --- a/scripts/vm/hypervisor/kvm/nasbackup.sh +++ b/scripts/vm/hypervisor/kvm/nasbackup.sh @@ -142,7 +142,8 @@ backup_running_vm() { break ;; Failed) echo "Virsh backup job failed" - cleanup ;; + cleanup + exit 1 ;; esac sleep 5 done @@ -178,6 +179,7 @@ backup_stopped_vm() { if ! 
qemu-img convert -O qcow2 "$disk" "$output" > "$logFile" 2> >(cat >&2); then echo "qemu-img convert failed for $disk $output" cleanup + exit 1 fi name="datadisk" done @@ -222,6 +224,19 @@ mount_operation() { cleanup() { local status=0 + # Resume the VM if it was paused during backup to prevent it from + # remaining indefinitely paused when the backup job fails (e.g. due + # to storage full or I/O errors on the backup target) + local vm_state + vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null) + if [[ "$vm_state" == "paused" ]]; then + log -ne "Resuming paused VM $VM during backup cleanup" + if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then + echo "Failed to resume VM $VM" + status=1 + fi + fi + rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } umount "$mount_point" || { echo "Failed to unmount $mount_point"; status=1; } rmdir "$mount_point" || { echo "Failed to remove mount point $mount_point"; status=1; }