diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 207f65f..9270387 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -3,6 +3,7 @@ package cloudhypervisor import ( "context" "encoding/json" + "errors" "fmt" "net/http" "os" @@ -112,16 +113,33 @@ func shutdownVM(ctx context.Context, hc *http.Client) error { return err } +// pauseVM is idempotent — swallows CH's Paused→Paused 500 so a stuck-paused VM recovers. func pauseVM(ctx context.Context, hc *http.Client) error { _, err := vmAPIOnce(ctx, hc, "vm.pause", nil) + if err != nil && isAlreadyInStateError(err, "Paused") { + return nil + } return err } +// resumeVM is idempotent — swallows CH's Running→Running 500. func resumeVM(ctx context.Context, hc *http.Client) error { _, err := vmAPIOnce(ctx, hc, "vm.resume", nil) + if err != nil && isAlreadyInStateError(err, "Running") { + return nil + } return err } +// isAlreadyInStateError matches CH's exact `Invalid transition: InvalidStateTransition(, )` in a 500 body. +func isAlreadyInStateError(err error, state string) bool { + var ae *utils.APIError + if !errors.As(err, &ae) || ae.Code != http.StatusInternalServerError { + return false + } + return strings.Contains(ae.Message, fmt.Sprintf("Invalid transition: InvalidStateTransition(%s, %s)", state, state)) +} + // snapshotVM and restoreVM temporarily extend the client timeout for // long-running memory transfers, then restore it for subsequent calls. func snapshotVM(ctx context.Context, hc *http.Client, destDir string) error { diff --git a/hypervisor/cloudhypervisor/helper_test.go b/hypervisor/cloudhypervisor/helper_test.go new file mode 100644 index 0000000..f9195db --- /dev/null +++ b/hypervisor/cloudhypervisor/helper_test.go @@ -0,0 +1,39 @@ +package cloudhypervisor + +import ( + "errors" + "fmt" + "net/http" + "testing" + + "github.com/cocoonstack/cocoon/utils" +) + +func TestIsAlreadyInStateError(t *testing.T) { + chPaused := `PUT http://localhost/api/v1/vm.pause → 500: ["Error from API","The VM could not be paused","Cannot pause VM","Failed to pause migratable component","Invalid transition: InvalidStateTransition(Paused, Paused)"]` + chRunning := `PUT http://localhost/api/v1/vm.resume → 500: ["Error from API","Cannot resume VM","Failed","Invalid transition: InvalidStateTransition(Running, Running)"]` + + tests := []struct { + name string + err error + state string + want bool + }{ + {name: "paused paused match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "Paused", want: true}, + {name: "running running match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chRunning}, state: "Running", want: true}, + {name: "wrong state in match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "Running", want: false}, + {name: "non-500 code", err: &utils.APIError{Code: http.StatusBadRequest, Message: chPaused}, state: "Paused", want: false}, + {name: "different transition (Created→Paused)", err: &utils.APIError{Code: http.StatusInternalServerError, Message: "InvalidStateTransition(Created, Paused)"}, state: "Paused", want: false}, + {name: "non-APIError", err: errors.New("dial unix: connection refused"), state: "Paused", want: false}, + {name: "nil error", err: nil, state: "Paused", want: false}, + {name: "wrapped APIError", err: fmt.Errorf("snapshot save: %w", &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}), state: "Paused", want: true}, + {name: "empty state", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "", want: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isAlreadyInStateError(tt.err, tt.state); got != tt.want { + t.Errorf("isAlreadyInStateError(%v, %q) = %v, want %v", tt.err, tt.state, got, tt.want) + } + }) + } +} diff --git a/hypervisor/firecracker/api.go b/hypervisor/firecracker/api.go index 27ed966..73a1a81 100644 --- a/hypervisor/firecracker/api.go +++ b/hypervisor/firecracker/api.go @@ -169,8 +169,7 @@ func sendCtrlAltDel(ctx context.Context, hc *http.Client) error { return fcAPIOnce(ctx, hc, http.MethodPut, "/actions", body) } -// pauseVM pauses a running FC instance via PATCH /vm. Non-idempotent: a -// retry after a lost response would hit "already paused" and mask success. +// pauseVM pauses a running FC instance via PATCH /vm. Idempotent: FC's vCPU event loop acks Pause from the paused state without error (vstate/vcpu.rs). func pauseVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStatePaused}) if err != nil { @@ -179,8 +178,7 @@ func pauseVM(ctx context.Context, hc *http.Client) error { return fcAPIOnce(ctx, hc, http.MethodPatch, "/vm", body) } -// resumeVM resumes a paused FC instance via PATCH /vm. Same non-idempotent -// shape as pauseVM. +// resumeVM resumes a paused FC instance via PATCH /vm. Idempotent like pauseVM. func resumeVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStateResumed}) if err != nil {