From ae848bf1c38cd24f22bbbb6786894caeaedc1759 Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 16:05:11 +0800 Subject: [PATCH 1/5] fix: kill orphan VMM via /proc cmdline fallback when pidfile pre-removed PR #50's socket-probe tiebreaker only catches orphans whose api.sock is still listening. If pidfile and api.sock are both pre-removed before the VMM exits (observed on GKE prod after vk-cocoon rapid restart + CH InvalidStateTransition), DeleteAll's pidfile-based stop returns ErrNotRunning, the probe returns ENOENT, and the VMM survives as a PPID=1 orphan with no rundir. Add utils.FindVMMByCmdline as a /proc scan fallback keyed on the api-socket path (already unique per VM). Wire it into: - WithRunningVM: recover the live pid when pidfile/socket are gone - DeleteAll: second-pass after socket probe to catch sibling/worker pids Repro: sleep + rm pidfile + rm api.sock + cocoon vm rm --force leaves a CH orphan. With the fix, the cmdline scan recovers and SIGKILLs it. --- hypervisor/state.go | 17 ++++++++++++++--- hypervisor/stop.go | 14 ++++++++++++-- utils/process_linux.go | 20 ++++++++++++++++++++ utils/process_other.go | 4 ++++ utils/process_test.go | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 5 deletions(-) diff --git a/hypervisor/state.go b/hypervisor/state.go index b261e6fc..eec56b2a 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -19,14 +19,25 @@ const socketProbeTimeout = 2 * time.Second // WithRunningVM calls fn if rec still points to a live VM process. 
func (b *Backend) WithRunningVM(ctx context.Context, rec *VMRecord, fn func(pid int) error) error { + logger := log.WithFunc(b.Typ + ".WithRunningVM") pid, pidErr := utils.ReadPIDFile(b.PIDFilePath(rec.RunDir)) if pidErr != nil && !errors.Is(pidErr, fs.ErrNotExist) { - log.WithFunc(b.Typ+".WithRunningVM").Warnf(ctx, "read PID file: %v", pidErr) + logger.Warnf(ctx, "read PID file: %v", pidErr) } - if !utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), SocketPath(rec.RunDir)) { + sockPath := SocketPath(rec.RunDir) + if utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), sockPath) { + return fn(pid) + } + // Covers pidfile/socket cleaned up before VMM exited. + scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath) + if scanErr != nil { + logger.Warnf(ctx, "scan /proc for VM %s: %v", rec.ID, scanErr) + } + if len(scanned) == 0 { return ErrNotRunning } - return fn(pid) + logger.Warnf(ctx, "VM %s recovered live pids %v via cmdline scan", rec.ID, scanned) + return fn(scanned[0]) } // IsAPISocketLive: (true,nil)=confirmed live; (false,nil)=ENOENT/ECONNREFUSED; (true,err)=fail-closed for unknown dial errors. 
diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 1028f8c2..14ba0c6e 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -56,6 +56,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop if loadErr != nil { return loadErr } + sockPath := SocketPath(rec.RunDir) if runningErr := b.WithRunningVM(ctx, &rec, func(_ int) error { if !force { return fmt.Errorf("running (force required)") @@ -70,9 +71,18 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop return ctxErr } if probeErr != nil { - return fmt.Errorf("refuse delete: api socket %s probe inconclusive: %w (resolve the host issue or kill the vmm process then retry)", SocketPath(rec.RunDir), probeErr) + return fmt.Errorf("refuse delete: api socket %s probe inconclusive: %w (resolve the host issue or kill the vmm process then retry)", sockPath, probeErr) + } + return fmt.Errorf("refuse delete: api socket %s still responsive (suspected orphan vmm; kill the vmm process then retry)", sockPath) + } + // Catches workers/siblings the pidfile-based stop didn't see. 
+ if scanned, _ := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath); len(scanned) > 0 { + for _, pid := range scanned { + if termErr := utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()); termErr != nil { + return fmt.Errorf("terminate orphan VMM pid=%d for VM %s: %w", pid, id, termErr) + } + log.WithFunc(b.Typ+".Delete").Warnf(ctx, "killed orphan VMM pid=%d for VM %s", pid, id) } - return fmt.Errorf("refuse delete: api socket %s still responsive (suspected orphan vmm; kill the vmm process then retry)", SocketPath(rec.RunDir)) } if rmErr := RemoveVMDirs(rec.RunDir, rec.LogDir); rmErr != nil { return fmt.Errorf("cleanup VM dirs: %w", rmErr) diff --git a/utils/process_linux.go b/utils/process_linux.go index 0dafe183..06696f2d 100644 --- a/utils/process_linux.go +++ b/utils/process_linux.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "strconv" "strings" ) @@ -24,3 +25,22 @@ func verifyProcessCmdline(pid int, binaryName, expectArg string) (matched, avail } return strings.Contains(rest, expectArg), true } + +// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain marker. 
+func FindVMMByCmdline(binaryName, marker string) ([]int, error) { + entries, err := os.ReadDir("/proc") + if err != nil { + return nil, err + } + var pids []int + for _, e := range entries { + pid, err := strconv.Atoi(e.Name()) + if err != nil || pid <= 0 { + continue + } + if matched, _ := verifyProcessCmdline(pid, binaryName, marker); matched { + pids = append(pids, pid) + } + } + return pids, nil +} diff --git a/utils/process_other.go b/utils/process_other.go index f1d91b37..1315b47d 100644 --- a/utils/process_other.go +++ b/utils/process_other.go @@ -5,3 +5,7 @@ package utils func verifyProcessCmdline(_ int, _, _ string) (matched, available bool) { return false, false } + +func FindVMMByCmdline(_, _ string) ([]int, error) { + return nil, nil +} diff --git a/utils/process_test.go b/utils/process_test.go index 624c8184..5be74d02 100644 --- a/utils/process_test.go +++ b/utils/process_test.go @@ -5,6 +5,7 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "testing" "time" ) @@ -244,6 +245,46 @@ func TestTerminateProcess_SIGTERMIgnored_FallsBackToKill(t *testing.T) { <-waitDone } +func TestFindVMMByCmdline(t *testing.T) { + if _, err := os.Stat("/proc/self/cmdline"); err != nil { + t.Skip("/proc not available") + } + marker := "cocoon-find-marker-" + strconv.Itoa(os.Getpid()) + cmd := exec.Command("sleep", "60") + cmd.Args = []string{"sleep", marker, "60"} + if err := cmd.Start(); err != nil { + t.Fatalf("start: %v", err) + } + defer func() { + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Poll briefly: cmdline is written by execve, so the parent may scan before /proc/<pid>/cmdline reflects argv. 
+ var pids []int + for range 50 { + got, err := FindVMMByCmdline("sleep", marker) + if err != nil { + t.Fatalf("FindVMMByCmdline: %v", err) + } + if len(got) > 0 { + pids = got + break + } + time.Sleep(10 * time.Millisecond) + } + if len(pids) != 1 || pids[0] != cmd.Process.Pid { + t.Errorf("FindVMMByCmdline: got %v, want [%d]", pids, cmd.Process.Pid) + } + + if got, _ := FindVMMByCmdline("definitely-no-such-binary", marker); len(got) != 0 { + t.Errorf("wrong-binary scan matched: %v", got) + } + if got, _ := FindVMMByCmdline("sleep", "no-such-marker"); len(got) != 0 { + t.Errorf("wrong-marker scan matched: %v", got) + } +} + func TestTerminateProcess_ContextCancelled(t *testing.T) { // Start a process that ignores SIGTERM (sleep handles it by default though). cmd := exec.Command("sleep", "60") From f9d7a1413d7a8a8bccb6ed9466ef87462205f4ee Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 16:13:16 +0800 Subject: [PATCH 2/5] =?UTF-8?q?chore:=20senior-review=20fixes=20=E2=80=94?= =?UTF-8?q?=20public-above-private,=20expectArg=20naming?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reorder utils/process_*.go so FindVMMByCmdline sits above verifyProcessCmdline (matches sparse_linux.go / reflink_linux.go). - Rename the FindVMMByCmdline marker param to expectArg for consistency with VerifyProcessCmdline / TerminateProcess / pidfd_linux.go. --- utils/process_linux.go | 38 +++++++++++++++++++------------------- utils/process_other.go | 8 ++++---- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/utils/process_linux.go b/utils/process_linux.go index 06696f2d..66c08968 100644 --- a/utils/process_linux.go +++ b/utils/process_linux.go @@ -10,24 +10,8 @@ import ( "strings" ) -// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM. 
-func verifyProcessCmdline(pid int, binaryName, expectArg string) (matched, available bool) { - data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) - if err != nil { - return false, false - } - argv0, rest, _ := strings.Cut(string(data), "\x00") - if filepath.Base(argv0) != binaryName { - return false, true - } - if expectArg == "" { - return true, true - } - return strings.Contains(rest, expectArg), true -} - -// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain marker. -func FindVMMByCmdline(binaryName, marker string) ([]int, error) { +// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg. +func FindVMMByCmdline(binaryName, expectArg string) ([]int, error) { entries, err := os.ReadDir("/proc") if err != nil { return nil, err @@ -38,9 +22,25 @@ func FindVMMByCmdline(binaryName, marker string) ([]int, error) { if err != nil || pid <= 0 { continue } - if matched, _ := verifyProcessCmdline(pid, binaryName, marker); matched { + if matched, _ := verifyProcessCmdline(pid, binaryName, expectArg); matched { pids = append(pids, pid) } } return pids, nil } + +// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM. 
+func verifyProcessCmdline(pid int, binaryName, expectArg string) (matched, available bool) { + data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) + if err != nil { + return false, false + } + argv0, rest, _ := strings.Cut(string(data), "\x00") + if filepath.Base(argv0) != binaryName { + return false, true + } + if expectArg == "" { + return true, true + } + return strings.Contains(rest, expectArg), true +} diff --git a/utils/process_other.go b/utils/process_other.go index 1315b47d..088634e1 100644 --- a/utils/process_other.go +++ b/utils/process_other.go @@ -2,10 +2,10 @@ package utils -func verifyProcessCmdline(_ int, _, _ string) (matched, available bool) { - return false, false -} - func FindVMMByCmdline(_, _ string) ([]int, error) { return nil, nil } + +func verifyProcessCmdline(_ int, _, _ string) (matched, available bool) { + return false, false +} From a75442fa58dd10e1fdfbf20216e6d3179924ddae Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 16:23:05 +0800 Subject: [PATCH 3/5] fix: address Copilot round-1 findings on orphan VMM PR - utils/process_linux.go: slices.Sort the returned pids so callers get a deterministic smallest-pid choice (Copilot caught the /proc lexicographic ordering trap, e.g. "100" < "11"). - hypervisor/state.go: fail-closed when /proc scan errors after pidfile-based check fails; previously returned ErrNotRunning on inconclusive state, which could let start/delete proceed against a still-running VM. - hypervisor/stop.go: fail-closed in DeleteAll second-pass when /proc scan errors; previously dropped scanErr and risked re-introducing the orphan leak the PR is trying to fix. - utils/process_test.go: replace flaky "sleep marker 60" (sleep rejects non-numeric arg and exits immediately) with "sh -c 'sleep 60 && :' marker" (compound prevents sh tail-exec into sleep). Gate on runtime.GOOS == "linux". 
--- hypervisor/state.go | 4 ++-- hypervisor/stop.go | 16 +++++++++------- utils/process_linux.go | 4 +++- utils/process_test.go | 13 +++++++------ 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/hypervisor/state.go b/hypervisor/state.go index eec56b2a..584b7a04 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -28,10 +28,10 @@ func (b *Backend) WithRunningVM(ctx context.Context, rec *VMRecord, fn func(pid if utils.VerifyProcessCmdline(pid, b.Conf.BinaryName(), sockPath) { return fn(pid) } - // Covers pidfile/socket cleaned up before VMM exited. + // Covers pidfile/socket cleaned up before VMM exited. Fail-closed if scan errors so callers don't treat inconclusive state as ErrNotRunning. scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath) if scanErr != nil { - logger.Warnf(ctx, "scan /proc for VM %s: %v", rec.ID, scanErr) + return fmt.Errorf("VM %s: pidfile-based check failed and /proc scan errored: %w", rec.ID, scanErr) } if len(scanned) == 0 { return ErrNotRunning diff --git a/hypervisor/stop.go b/hypervisor/stop.go index 14ba0c6e..a62af6e3 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -75,14 +75,16 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop } return fmt.Errorf("refuse delete: api socket %s still responsive (suspected orphan vmm; kill the vmm process then retry)", sockPath) } - // Catches workers/siblings the pidfile-based stop didn't see. 
- if scanned, _ := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath); len(scanned) > 0 { - for _, pid := range scanned { - if termErr := utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()); termErr != nil { - return fmt.Errorf("terminate orphan VMM pid=%d for VM %s: %w", pid, id, termErr) - } - log.WithFunc(b.Typ+".Delete").Warnf(ctx, "killed orphan VMM pid=%d for VM %s", pid, id) + // Catches workers/siblings the pidfile-based stop didn't see; fail-closed on scan error so we never wipe rundir while VMM state is unknown. + scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath) + if scanErr != nil { + return fmt.Errorf("refuse delete: VM %s /proc scan errored: %w (resolve host issue and retry)", id, scanErr) + } + for _, pid := range scanned { + if termErr := utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()); termErr != nil { + return fmt.Errorf("terminate orphan VMM pid=%d for VM %s: %w", pid, id, termErr) } + log.WithFunc(b.Typ+".Delete").Warnf(ctx, "killed orphan VMM pid=%d for VM %s", pid, id) } if rmErr := RemoveVMDirs(rec.RunDir, rec.LogDir); rmErr != nil { return fmt.Errorf("cleanup VM dirs: %w", rmErr) diff --git a/utils/process_linux.go b/utils/process_linux.go index 66c08968..c78ea815 100644 --- a/utils/process_linux.go +++ b/utils/process_linux.go @@ -6,11 +6,12 @@ import ( "fmt" "os" "path/filepath" + "slices" "strconv" "strings" ) -// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg. +// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg, sorted numerically. 
func FindVMMByCmdline(binaryName, expectArg string) ([]int, error) { entries, err := os.ReadDir("/proc") if err != nil { @@ -26,6 +27,7 @@ func FindVMMByCmdline(binaryName, expectArg string) ([]int, error) { pids = append(pids, pid) } } + slices.Sort(pids) return pids, nil } diff --git a/utils/process_test.go b/utils/process_test.go index 5be74d02..851f9096 100644 --- a/utils/process_test.go +++ b/utils/process_test.go @@ -5,6 +5,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strconv" "testing" "time" @@ -246,12 +247,12 @@ func TestTerminateProcess_SIGTERMIgnored_FallsBackToKill(t *testing.T) { } func TestFindVMMByCmdline(t *testing.T) { - if _, err := os.Stat("/proc/self/cmdline"); err != nil { - t.Skip("/proc not available") + if runtime.GOOS != "linux" { + t.Skip("FindVMMByCmdline scans /proc — linux only") } marker := "cocoon-find-marker-" + strconv.Itoa(os.Getpid()) - cmd := exec.Command("sleep", "60") - cmd.Args = []string{"sleep", marker, "60"} + // "sleep 60 && :" is a compound command so sh can't tail-exec into sleep and lose the marker arg. + cmd := exec.Command("sh", "-c", "sleep 60 && :", marker) if err := cmd.Start(); err != nil { t.Fatalf("start: %v", err) } @@ -263,7 +264,7 @@ func TestFindVMMByCmdline(t *testing.T) { // Poll briefly: cmdline is written by execve, so the parent may scan before /proc/<pid>/cmdline reflects argv. 
var pids []int for range 50 { - got, err := FindVMMByCmdline("sleep", marker) + got, err := FindVMMByCmdline("sh", marker) if err != nil { t.Fatalf("FindVMMByCmdline: %v", err) } @@ -280,7 +281,7 @@ func TestFindVMMByCmdline(t *testing.T) { if got, _ := FindVMMByCmdline("definitely-no-such-binary", marker); len(got) != 0 { t.Errorf("wrong-binary scan matched: %v", got) } - if got, _ := FindVMMByCmdline("sleep", "no-such-marker"); len(got) != 0 { + if got, _ := FindVMMByCmdline("sh", "no-such-marker-"+marker); len(got) != 0 { t.Errorf("wrong-marker scan matched: %v", got) } } From 1b69b42c982066a182c4283183323015ba703034 Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 17:26:27 +0800 Subject: [PATCH 4/5] chore: align fail-closed error strings with sibling refuse-delete wording state.go + stop.go: add "(resolve the host issue and retry)" actionable-hint clause so the new scan-error wraps match the existing socket-probe error format. --- hypervisor/state.go | 2 +- hypervisor/stop.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hypervisor/state.go b/hypervisor/state.go index 584b7a04..747bcff3 100644 --- a/hypervisor/state.go +++ b/hypervisor/state.go @@ -31,7 +31,7 @@ func (b *Backend) WithRunningVM(ctx context.Context, rec *VMRecord, fn func(pid // Covers pidfile/socket cleaned up before VMM exited. Fail-closed if scan errors so callers don't treat inconclusive state as ErrNotRunning. 
scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath) if scanErr != nil { - return fmt.Errorf("VM %s: pidfile-based check failed and /proc scan errored: %w", rec.ID, scanErr) + return fmt.Errorf("VM %s: pidfile-based check failed and /proc scan errored: %w (resolve the host issue and retry)", rec.ID, scanErr) } if len(scanned) == 0 { return ErrNotRunning diff --git a/hypervisor/stop.go b/hypervisor/stop.go index a62af6e3..5077fb89 100644 --- a/hypervisor/stop.go +++ b/hypervisor/stop.go @@ -78,7 +78,7 @@ func (b *Backend) DeleteAll(ctx context.Context, refs []string, force bool, stop // Catches workers/siblings the pidfile-based stop didn't see; fail-closed on scan error so we never wipe rundir while VMM state is unknown. scanned, scanErr := utils.FindVMMByCmdline(b.Conf.BinaryName(), sockPath) if scanErr != nil { - return fmt.Errorf("refuse delete: VM %s /proc scan errored: %w (resolve host issue and retry)", id, scanErr) + return fmt.Errorf("refuse delete: VM %s /proc scan errored: %w (resolve the host issue and retry)", id, scanErr) } for _, pid := range scanned { if termErr := utils.TerminateProcess(ctx, pid, b.Conf.BinaryName(), sockPath, b.Conf.TerminateGracePeriod()); termErr != nil { From 158e536c8d383987719187e5be05f51c041fa77b Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 17:37:09 +0800 Subject: [PATCH 5/5] fix: surface non-ENOENT cmdline read errors so FindVMMByCmdline fails closed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot round-3 finding: verifyProcessCmdline returned (false, available=false) on permission/IO errors reading /proc/<pid>/cmdline (e.g. hidepid/EPERM), and FindVMMByCmdline silently dropped that signal — a hidepid environment could mask the real VMM and reintroduce the orphan leak. 
Refactor verifyProcessCmdline to return (bool, error); FindVMMByCmdline now distinguishes ENOENT (transient race, safe to skip) from any other read error (fail-closed, return wrapped first error). VerifyProcessCmdline wrapper preserves the "fall back to IsProcessAlive on error" semantic. --- utils/process.go | 4 ++-- utils/process_linux.go | 34 ++++++++++++++++++++++++---------- utils/process_other.go | 8 ++++++-- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/utils/process.go b/utils/process.go index 7202dc41..9fc43be8 100644 --- a/utils/process.go +++ b/utils/process.go @@ -41,12 +41,12 @@ func IsProcessAlive(pid int) bool { } // VerifyProcessCmdline matches pid against binaryName + expectArg in -// /proc/<pid>/cmdline; falls back to IsProcessAlive on non-Linux. +// /proc/<pid>/cmdline; falls back to IsProcessAlive on non-Linux or read errors. func VerifyProcessCmdline(pid int, binaryName, expectArg string) bool { if pid <= 0 { return false } - if match, ok := verifyProcessCmdline(pid, binaryName, expectArg); ok { + if match, err := verifyProcessCmdline(pid, binaryName, expectArg); err == nil { return match } return IsProcessAlive(pid) diff --git a/utils/process_linux.go b/utils/process_linux.go index c78ea815..39a4afb3 100644 --- a/utils/process_linux.go +++ b/utils/process_linux.go @@ -3,7 +3,9 @@ package utils import ( + "errors" "fmt" + "io/fs" "os" "path/filepath" "slices" @@ -11,38 +13,50 @@ import ( "strings" ) -// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg, sorted numerically. +// FindVMMByCmdline returns pids whose argv[0] basename matches binaryName and args contain expectArg, sorted numerically; fails closed on non-ENOENT cmdline read errors. 
func FindVMMByCmdline(binaryName, expectArg string) ([]int, error) { entries, err := os.ReadDir("/proc") if err != nil { return nil, err } var pids []int + var firstErr error for _, e := range entries { - pid, err := strconv.Atoi(e.Name()) - if err != nil || pid <= 0 { + pid, atoiErr := strconv.Atoi(e.Name()) + if atoiErr != nil || pid <= 0 { continue } - if matched, _ := verifyProcessCmdline(pid, binaryName, expectArg); matched { + matched, readErr := verifyProcessCmdline(pid, binaryName, expectArg) + if readErr != nil { + // ENOENT = process exited mid-scan, safe to skip; everything else means we can't tell, so callers must fail closed. + if !errors.Is(readErr, fs.ErrNotExist) && firstErr == nil { + firstErr = fmt.Errorf("read /proc/%d/cmdline: %w", pid, readErr) + } + continue + } + if matched { pids = append(pids, pid) } } + if firstErr != nil { + return nil, firstErr + } slices.Sort(pids) return pids, nil } -// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM. -func verifyProcessCmdline(pid int, binaryName, expectArg string) (matched, available bool) { +// Match argv[0] basename strictly + expectArg substring on the rest so "bash -c 'cloud-hypervisor ...'" can't impersonate the VMM; error surfaces cmdline-read failures so callers distinguish transient ENOENT from real issues. 
+func verifyProcessCmdline(pid int, binaryName, expectArg string) (bool, error) { data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) if err != nil { - return false, false + return false, err } argv0, rest, _ := strings.Cut(string(data), "\x00") if filepath.Base(argv0) != binaryName { - return false, true + return false, nil } if expectArg == "" { - return true, true + return true, nil } - return strings.Contains(rest, expectArg), true + return strings.Contains(rest, expectArg), nil } diff --git a/utils/process_other.go b/utils/process_other.go index 088634e1..4f479f83 100644 --- a/utils/process_other.go +++ b/utils/process_other.go @@ -2,10 +2,14 @@ package utils +import "errors" + +var errVerifyUnsupported = errors.New("verifyProcessCmdline: unsupported on this OS") + func FindVMMByCmdline(_, _ string) ([]int, error) { return nil, nil } -func verifyProcessCmdline(_ int, _, _ string) (matched, available bool) { - return false, false +func verifyProcessCmdline(_ int, _, _ string) (bool, error) { + return false, errVerifyUnsupported }