Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions hypervisor/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,25 @@ const socketProbeTimeout = 2 * time.Second

// WithRunningVM calls fn if rec still points to a live VM process.
func (b *Backend) WithRunningVM(ctx context.Context, rec *VMRecord, fn func(pid int) error) error {
logger := log.WithFunc(b.Typ + ".WithRunningVM")
pid, pidErr := utils.ReadPIDFile(b.PIDFilePath(rec.RunDir))
if pidErr != nil && !errors.Is(pidErr, fs.ErrNotExist) {
log.WithFunc(b.Typ+".WithRunningVM").Warnf(ctx, "read PID file: %v", pidErr)
logger.Warnf(ctx, "read PID file: %v", pidErr)
}
if !utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), SocketPath(rec.RunDir)) {
sockPath := SocketPath(rec.RunDir)
if utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), sockPath) {
return fn(pid)
}
// Covers pidfile/socket cleaned up before VMM exited. Fail-closed if scan errors so callers don't treat inconclusive state as ErrNotRunning.
scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath)
if scanErr != nil {
return fmt.Errorf("VM %s: pidfile-based check failed and /proc scan errored: %w (resolve the host issue and retry)", rec.ID, scanErr)
}
if len(scanned) == 0 {
Comment thread
CMGS marked this conversation as resolved.
return ErrNotRunning
}
return fn(pid)
logger.Warnf(ctx, "VM %s recovered live pids %v via cmdline scan", rec.ID, scanned)
return fn(scanned[0])
}

// IsAPISocketLive: (true,nil)=confirmed live; (false,nil)=ENOENT/ECONNREFUSED; (true,err)=fail-closed for unknown dial errors.
Expand Down
16 changes: 14 additions & 2 deletions hypervisor/stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop
if loadErr != nil {
return loadErr
}
sockPath := SocketPath(rec.RunDir)
if runningErr := b.WithRunningVM(ctx, &rec, func(_ int) error {
if !force {
return fmt.Errorf("running (force required)")
Expand All @@ -70,9 +71,20 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop
return ctxErr
}
if probeErr != nil {
return fmt.Errorf("refuse delete: api socket %s probe inconclusive: %w (resolve the host issue or kill the vmm process then retry)", SocketPath(rec.RunDir), probeErr)
return fmt.Errorf("refuse delete: api socket %s probe inconclusive: %w (resolve the host issue or kill the vmm process then retry)", sockPath, probeErr)
}
return fmt.Errorf("refuse delete: api socket %s still responsive (suspected orphan vmm; kill the vmm process then retry)", SocketPath(rec.RunDir))
return fmt.Errorf("refuse delete: api socket %s still responsive (suspected orphan vmm; kill the vmm process then retry)", sockPath)
}
// Catches workers/siblings the pidfile-based stop didn't see; fail-closed on scan error so we never wipe rundir while VMM state is unknown.
scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath)
if scanErr != nil {
return fmt.Errorf("refuse delete: VM %s /proc scan errored: %w (resolve the host issue and retry)", id, scanErr)
}
for _, pid := range scanned {
if termErr := utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()); termErr != nil {
return fmt.Errorf("terminate orphan VMM pid=%d for VM %s: %w", pid, id, termErr)
}
log.WithFunc(b.Typ+".Delete").Warnf(ctx, "killed orphan VMM pid=%d for VM %s", pid, id)
}
if rmErr := RemoveVMDirs(rec.RunDir, rec.LogDir); rmErr != nil {
return fmt.Errorf("cleanup VM dirs: %w", rmErr)
Expand Down
4 changes: 2 additions & 2 deletions utils/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ func IsProcessAlive(pid int) bool {
}

// VerifyProcessCmdline matches pid against binaryName + expectArg in
// /proc/<pid>/cmdline; falls back to IsProcessAlive on non-Linux.
// /proc/<pid>/cmdline; falls back to IsProcessAlive on non-Linux or read errors.
func VerifyProcessCmdline(pid int, binaryName, expectArg string) bool {
if pid <= 0 {
return false
}
if match, ok := verifyProcessCmdline(pid, binaryName, expectArg); ok {
if match, err := verifyProcessCmdline(pid, binaryName, expectArg); err == nil {
return match
}
return IsProcessAlive(pid)
Expand Down
48 changes: 42 additions & 6 deletions utils/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,60 @@
package utils

import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
)

// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM.
func verifyProcessCmdline(pid int, binaryName, expectArg string) (matched, available bool) {
// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg, sorted numerically; fails closed on non-ENOENT cmdline read errors.
func FindVMMByCmdline(binaryName, expectArg string) ([]int, error) {
entries, err := os.ReadDir("/proc")
if err != nil {
return nil, err
}
var pids []int
var firstErr error
for _, e := range entries {
pid, atoiErr := strconv.Atoi(e.Name())
if atoiErr != nil || pid <= 0 {
continue
}
matched, readErr := verifyProcessCmdline(pid, binaryName, expectArg)
if readErr != nil {
// ENOENT = process exited mid-scan, safe to skip; everything else means we can't tell, so callers must fail closed.
if !errors.Is(readErr, fs.ErrNotExist) && firstErr == nil {
firstErr = fmt.Errorf("read /proc/%d/cmdline: %w", pid, readErr)
}
continue
}
if matched {
pids = append(pids, pid)
}
}
Comment thread
CMGS marked this conversation as resolved.
if firstErr != nil {
return nil, firstErr
}
slices.Sort(pids)
return pids, nil
Comment thread
CMGS marked this conversation as resolved.
}

// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM; error surfaces cmdline-read failures so callers distinguish transient ENOENT from real issues.
func verifyProcessCmdline(pid int, binaryName, expectArg string) (bool, error) {
data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
if err != nil {
return false, false
return false, err
}
argv0, rest, _ := strings.Cut(string(data), "\x00")
if filepath.Base(argv0) != binaryName {
return false, true
return false, nil
}
if expectArg == "" {
return true, true
return true, nil
}
return strings.Contains(rest, expectArg), true
return strings.Contains(rest, expectArg), nil
}
12 changes: 10 additions & 2 deletions utils/process_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

package utils

func verifyProcessCmdline(_ int, _, _ string) (matched, available bool) {
return false, false
import "errors"

var errVerifyUnsupported = errors.New("verifyProcessCmdline: unsupported on this OS")

func FindVMMByCmdline(_, _ string) ([]int, error) {
return nil, nil
}

func verifyProcessCmdline(_ int, _, _ string) (bool, error) {
return false, errVerifyUnsupported
}
42 changes: 42 additions & 0 deletions utils/process_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"testing"
"time"
)
Expand Down Expand Up @@ -244,6 +246,46 @@ func TestTerminateProcess_SIGTERMIgnored_FallsBackToKill(t *testing.T) {
<-waitDone
}

func TestFindVMMByCmdline(t *testing.T) {
if runtime.GOOS != "linux" {
t.Skip("FindVMMByCmdline scans /proc — linux only")
}
marker := "cocoon-find-marker-" + strconv.Itoa(os.Getpid())
// "sleep 60 && :" is a compound command so sh can't tail-exec into sleep and lose the marker arg.
cmd := exec.Command("sh", "-c", "sleep 60 && :", marker)
if err := cmd.Start(); err != nil {
Comment thread
CMGS marked this conversation as resolved.
t.Fatalf("start: %v", err)
}
defer func() {
_ = cmd.Process.Kill()
_ = cmd.Wait()
}()

// Poll briefly: cmdline is written by execve, so the parent may scan before /proc/<pid>/cmdline reflects argv.
var pids []int
for range 50 {
got, err := FindVMMByCmdline("sh", marker)
if err != nil {
t.Fatalf("FindVMMByCmdline: %v", err)
}
if len(got) > 0 {
pids = got
break
}
time.Sleep(10 * time.Millisecond)
}
if len(pids) != 1 || pids[0] != cmd.Process.Pid {
t.Errorf("FindVMMByCmdline: got %v, want [%d]", pids, cmd.Process.Pid)
}

if got, _ := FindVMMByCmdline("definitely-no-such-binary", marker); len(got) != 0 {
t.Errorf("wrong-binary scan matched: %v", got)
}
if got, _ := FindVMMByCmdline("sh", "no-such-marker-"+marker); len(got) != 0 {
t.Errorf("wrong-marker scan matched: %v", got)
}
}

func TestTerminateProcess_ContextCancelled(t *testing.T) {
// Start a process that ignores SIGTERM (sleep handles it by default though).
cmd := exec.Command("sleep", "60")
Expand Down
Loading