From b4cfef1c5bfe82d3a8ee92159602dcf0c991d690 Mon Sep 17 00:00:00 2001
From: Jayson Grace <jayson.e.grace@gmail.com>
Date: Sun, 21 Jun 2026 16:46:27 -0600
Subject: [PATCH 1/2] fix: distinguish transient empty output from genuine
 empty MSSQL results (#2)

**Key Changes:**

- Introduced `mssqlProbe` with a sentinel-based protocol to reliably separate a genuine empty SQL result set from a host still settling after provisioning
- Added retry logic in `runPSErr` and `runScriptJSON` to absorb transient empty-but-successful responses without masking real "not found" results
- Upgraded all MSSQL check call sites to handle the new `(string, bool)` return so probes that cannot complete emit WARN instead of a bogus FAIL
- Added comprehensive unit tests covering the sentinel, retry, recovery, and transport-error paths

**Added:**

- `mssqlProbe` method - new dedicated SQL probe that appends `sqlProbeSentinel` to every query script and uses its presence to distinguish a completed-but-empty result (`ok=true, rows=""`) from a probe that never finished (`ok=false`); callers WARN rather than FAIL on the latter
- `sqlProbeSentinel` and `transientRetries` constants plus `backoffBase` variable and `backoffSleep` helper in `script_runner.go` to centralise retry policy and allow tests to shrink backoff to milliseconds
- `runPSTransport` method extracted from `runPSErr` to own transport-level retries and dead-host tracking, keeping the two concerns separate
- `probe_retry_test.go` - seven focused tests covering: genuine empty is definitive (no retry), transient empty retries then warns, mid-probe recovery, transport error does not amplify retries, `runPSErr` retry behaviour, non-empty returns immediately, and `runScriptJSON` envelope retry

**Changed:**

- `runPSErr` now wraps `runPSTransport` and adds an outer loop that retries empty-but-successful responses up to `transientRetries` times with backoff before returning a blank result with `nil` error, preserving the existing contract for callers
- `runScriptJSON` replaced the single `runPS` call with a retry loop over `runPSErr` that re-runs when the JSON envelope is absent (a transient settling signature) and fails fast only on malformed envelopes, which indicate a script bug rather than a transient blip
- `mssqlQueryFn` type signature updated from `func(...) string` to `func(...) (string, bool)` to propagate probe completion status through `checkMSSQLExtendedFeatures`
- All three MSSQL check loops (sysadmins, `EXECUTE AS LOGIN`, linked servers) and the `xp_cmdshell`/`TRUSTWORTHY` checks now use `switch`/`if !ok` guards to emit WARN with a `(host settling?)` hint when a probe does not complete, instead of silently treating the empty string as a definitive negative
---
 cli/internal/validate/checks.go           |  96 +++++++---
 cli/internal/validate/probe_retry_test.go | 214 ++++++++++++++++++++++
 cli/internal/validate/script_runner.go    |  75 ++++++--
 cli/internal/validate/validator.go        |  34 ++++
 4 files changed, 378 insertions(+), 41 deletions(-)
 create mode 100644 cli/internal/validate/probe_retry_test.go

diff --git a/cli/internal/validate/checks.go b/cli/internal/validate/checks.go
index 4efe5270..597b26af 100644
--- a/cli/internal/validate/checks.go
+++ b/cli/internal/validate/checks.go
@@ -320,54 +320,54 @@ func (v *Validator) checkMSSQL(ctx context.Context, w io.Writer) {
 		}
 		v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL running on %s", hostLabel), "")
 
-		sqlQuery := func(sqlTmpl string, vars map[string]any) string {
-			out, err := runScriptText(ctx, v, host,
-				`$c = New-Object System.Data.SqlClient.SqlConnection 'Server=localhost;Integrated Security=True;TrustServerCertificate=True'; `+
-					`$c.Open(); $cmd = $c.CreateCommand(); `+
-					`$cmd.CommandText = @"`+"\n"+sqlTmpl+"\n"+`"@; `+
-					`$r = $cmd.ExecuteReader(); while ($r.Read()) { Write-Output $r[0].ToString() }; $r.Close(); $c.Close()`,
-				vars)
-			if err != nil {
-				return ""
-			}
-			return out
+		sqlQuery := func(sqlTmpl string, vars map[string]any) (string, bool) {
+			return v.mssqlProbe(ctx, host, sqlTmpl, vars)
 		}
 
 		for _, admin := range mf.MSSQL.SysAdmins {
-			output = sqlQuery(
+			rows, ok := sqlQuery(
 				`SELECT m.name FROM sys.server_role_members srm `+
 					`JOIN sys.server_principals r ON srm.role_principal_id = r.principal_id `+
 					`JOIN sys.server_principals m ON srm.member_principal_id = m.principal_id `+
 					`WHERE r.name = 'sysadmin' AND m.name = {{psq .Admin}}`,
 				map[string]any{"Admin": admin})
-			if output != "" {
+			switch {
+			case !ok:
+				v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine sysadmin status of %s on %s (host settling?)", admin, hostLabel), "")
+			case rows != "":
 				v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("%s is sysadmin on %s", admin, hostLabel), "")
-			} else {
+			default:
 				v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("%s is NOT sysadmin on %s", admin, hostLabel), "")
 			}
 		}
 
 		for grantee, target := range mf.MSSQL.ExecuteAsLogin {
-			output = sqlQuery(
+			rows, ok := sqlQuery(
 				`SELECT pr.name FROM sys.server_permissions sp `+
 					`JOIN sys.server_principals pr ON sp.grantee_principal_id = pr.principal_id `+
 					`JOIN sys.server_principals pr2 ON sp.major_id = pr2.principal_id `+
 					`WHERE sp.permission_name = 'IMPERSONATE' AND pr.name = {{psq .Grantee}} AND pr2.name = {{psq .Target}}`,
 				map[string]any{"Grantee": grantee, "Target": target})
-			if output != "" {
+			switch {
+			case !ok:
+				v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine IMPERSONATE grant for %s->%s on %s (host settling?)", grantee, target, hostLabel), "")
+			case rows != "":
 				v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("%s can impersonate %s on %s", grantee, target, hostLabel), "")
-			} else {
+			default:
 				v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("%s CANNOT impersonate %s on %s", grantee, target, hostLabel), "")
 			}
 		}
 
 		for name, ls := range mf.MSSQL.LinkedServers {
-			output = sqlQuery(
+			rows, ok := sqlQuery(
 				`SELECT name FROM sys.servers WHERE is_linked = 1 AND name = {{psq .Name}}`,
 				map[string]any{"Name": name})
-			if output != "" {
+			switch {
+			case !ok:
+				v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine linked server %s on %s (host settling?)", name, hostLabel), "")
+			case rows != "":
 				v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("Linked server %s -> %s on %s", name, ls.DataSrc, hostLabel), "")
-			} else {
+			default:
 				v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("Linked server %s NOT found on %s", name, hostLabel), "")
 			}
 		}
@@ -376,13 +376,47 @@ func (v *Validator) checkMSSQL(ctx context.Context, w io.Writer) {
 	}
 }
 
-type mssqlQueryFn func(sqlTmpl string, vars map[string]any) string
+// mssqlProbe runs a read-only SQL query on host over a local integrated-auth
+// connection and returns the rows (one per line). ok is false only when the
+// probe could not be completed after retries — a host still settling right
+// after provisioning returns empty or truncated stdout even though the WinRM
+// transport reports success. The sqlProbeSentinel, printed only after the
+// query finishes, separates that transient case from a genuine empty result
+// set: a completed query with no rows returns ("", true). Callers should WARN
+// (not FAIL) when ok is false so a healthy lab is never reported as broken.
+func (v *Validator) mssqlProbe(ctx context.Context, host, sqlTmpl string, vars map[string]any) (string, bool) {
+	script := `$c = New-Object System.Data.SqlClient.SqlConnection 'Server=localhost;Integrated Security=True;TrustServerCertificate=True'; ` +
+		`$c.Open(); $cmd = $c.CreateCommand(); ` +
+		`$cmd.CommandText = @"` + "\n" + sqlTmpl + "\n" + `"@; ` +
+		`$r = $cmd.ExecuteReader(); while ($r.Read()) { Write-Output $r[0].ToString() }; $r.Close(); $c.Close(); ` +
+		`Write-Output '` + sqlProbeSentinel + `'`
+	// runPSErr already retries empty-but-successful output (a settling host),
+	// so a single call suffices: a completed query always prints the sentinel
+	// — even with zero rows — so its presence means the result is authoritative
+	// (empty rows = a genuine negative). Its absence means the probe never
+	// completed (transport error, or empty output that outlived the retries):
+	// report ok=false so the caller WARNs instead of emitting a bogus FAIL.
+	out, err := runScriptText(ctx, v, host, script, vars)
+	if err != nil || !strings.Contains(out, sqlProbeSentinel) {
+		return "", false
+	}
+	return strings.TrimSpace(strings.Replace(out, sqlProbeSentinel, "", 1)), true
+}
+
+type mssqlQueryFn func(sqlTmpl string, vars map[string]any) (string, bool)
 
 func (v *Validator) checkMSSQLExtendedFeatures(w io.Writer, sqlQuery mssqlQueryFn, hostLabel string) {
-	output := sqlQuery(
+	xpOut, ok := sqlQuery(
 		`SELECT CONVERT(INT, ISNULL(value, value_in_use)) FROM sys.configurations WHERE name = 'xp_cmdshell'`,
 		nil)
-	xpEnabled := strings.TrimSpace(output) == "1"
+	if !ok {
+		// Couldn't get a definitive answer; the SeImpersonate probe below
+		// depends on xp_cmdshell, so skip the whole group rather than emit a
+		// bogus "NOT enabled".
+		v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not query xp_cmdshell on %s (host settling?)", hostLabel), "")
+		return
+	}
+	xpEnabled := strings.TrimSpace(xpOut) == "1"
 	if xpEnabled {
 		v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("xp_cmdshell enabled on %s", hostLabel), "")
 	} else {
@@ -390,17 +424,21 @@ func (v *Validator) checkMSSQLExtendedFeatures(w io.Writer, sqlQuery mssqlQueryF
 	}
 
 	if xpEnabled {
-		privOut := sqlQuery(`EXEC xp_cmdshell 'whoami /priv'`, nil)
-		if strings.Contains(privOut, "SeImpersonatePrivilege") {
-			v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL service has SeImpersonatePrivilege on %s (potato escalation possible)", hostLabel), "")
-		} else if strings.TrimSpace(privOut) != "" {
-			v.addResult(w, "INFO", "MSSQL", fmt.Sprintf("SeImpersonatePrivilege NOT found on MSSQL service on %s", hostLabel), "")
+		if privOut, ok := sqlQuery(`EXEC xp_cmdshell 'whoami /priv'`, nil); ok {
+			if strings.Contains(privOut, "SeImpersonatePrivilege") {
+				v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL service has SeImpersonatePrivilege on %s (potato escalation possible)", hostLabel), "")
+			} else if strings.TrimSpace(privOut) != "" {
+				v.addResult(w, "INFO", "MSSQL", fmt.Sprintf("SeImpersonatePrivilege NOT found on MSSQL service on %s", hostLabel), "")
+			}
 		}
 	}
 
-	trustworthy := sqlQuery(
+	trustworthy, ok := sqlQuery(
 		`SELECT name FROM sys.databases WHERE is_trustworthy_on = 1 AND name NOT IN ('master','tempdb')`,
 		nil)
+	if !ok {
+		return
+	}
 	dbs := parseOutputLines(trustworthy)
 	if len(dbs) > 0 {
 		v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("TRUSTWORTHY databases on %s: %s", hostLabel, strings.Join(dbs, ", ")), "")
diff --git a/cli/internal/validate/probe_retry_test.go b/cli/internal/validate/probe_retry_test.go
new file mode 100644
index 00000000..e949bdc9
--- /dev/null
+++ b/cli/internal/validate/probe_retry_test.go
@@ -0,0 +1,214 @@
+package validate
+
+import (
+	"context"
+	"io"
+	"log/slog"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/dreadnode/dreadgoad/internal/provider"
+)
+
+// stubProvider implements provider.Provider but only RunCommand is live; the
+// rest are no-ops sufficient to satisfy the interface for unit tests.
+type stubProvider struct {
+	run func(call int, command string) (*provider.CommandResult, error)
+	n   atomic.Int64
+}
+
+func (s *stubProvider) RunCommand(_ context.Context, _, command string, _ time.Duration) (*provider.CommandResult, error) {
+	call := int(s.n.Add(1))
+	return s.run(call, command)
+}
+
+func (s *stubProvider) Name() string                                      { return "stub" }
+func (s *stubProvider) VerifyCredentials(context.Context) (string, error) { return "stub", nil }
+func (s *stubProvider) DiscoverInstances(context.Context, string) ([]provider.Instance, error) {
+	return nil, nil
+}
+func (s *stubProvider) DiscoverAllInstances(context.Context, string) ([]provider.Instance, error) {
+	return nil, nil
+}
+func (s *stubProvider) FindInstanceByHostname(context.Context, string, string) (*provider.Instance, error) {
+	return nil, nil
+}
+func (s *stubProvider) StartInstances(context.Context, []string) error       { return nil }
+func (s *stubProvider) StopInstances(context.Context, []string) error        { return nil }
+func (s *stubProvider) WaitForInstanceStopped(context.Context, string) error { return nil }
+func (s *stubProvider) DestroyInstances(context.Context, []string) error     { return nil }
+func (s *stubProvider) RunCommandOnMultiple(context.Context, []string, string, time.Duration) (map[string]*provider.CommandResult, error) {
+	return nil, nil
+}
+func (s *stubProvider) CleanupStaleSessions(context.Context, []string, time.Duration, bool) (int, error) {
+	return 0, nil
+}
+func (s *stubProvider) DescribeActiveSessions(context.Context, string) ([]provider.Session, error) {
+	return nil, nil
+}
+func (s *stubProvider) Drain() {}
+
+func newStubValidator(t *testing.T, run func(call int, command string) (*provider.CommandResult, error)) (*Validator, *stubProvider) {
+	t.Helper()
+	// Shrink the retry backoff so retry paths don't sleep whole seconds.
+	old := backoffBase
+	backoffBase = time.Millisecond
+	t.Cleanup(func() { backoffBase = old })
+
+	prov := &stubProvider{run: run}
+	v := &Validator{
+		provider: prov,
+		log:      slog.New(slog.NewTextHandler(io.Discard, nil)),
+		hosts:    map[string]string{"SRV03": "i-123"},
+	}
+	return v, prov
+}
+
+// A completed query with zero rows must be reported as a genuine negative
+// (ok=true, empty rows) on the very first call — no retries, no false WARN.
+func TestMSSQLProbe_GenuineEmptyIsDefinitive(t *testing.T) {
+	v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) {
+		// Query ran, returned no rows, but the sentinel still prints.
+		return &provider.CommandResult{Status: "Success", Stdout: sqlProbeSentinel + "\n"}, nil
+	})
+
+	rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT 1 WHERE 1=0", nil)
+	if !ok {
+		t.Fatal("genuine empty result must return ok=true (definitive), got ok=false")
+	}
+	if rows != "" {
+		t.Errorf("expected empty rows, got %q", rows)
+	}
+	if got := prov.n.Load(); got != 1 {
+		t.Errorf("expected exactly 1 call (no retry on definitive result), got %d", got)
+	}
+}
+
+// Empty stdout (sentinel absent) is the post-provision settling signature: the
+// probe must retry and, if it never settles, return ok=false so the caller
+// WARNs instead of emitting a bogus "NOT configured" FAIL.
+func TestMSSQLProbe_TransientEmptyRetriesThenWarns(t *testing.T) {
+	v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: ""}, nil
+	})
+
+	rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT 1", nil)
+	if ok {
+		t.Fatal("sentinel-absent output must return ok=false, got ok=true")
+	}
+	if rows != "" {
+		t.Errorf("expected empty rows on transient failure, got %q", rows)
+	}
+	if got := prov.n.Load(); got != int64(transientRetries) {
+		t.Errorf("expected %d attempts, got %d", transientRetries, got)
+	}
+}
+
+// A host that settles mid-probe (empty, then a real row with the sentinel)
+// must recover and report the real result with ok=true.
+func TestMSSQLProbe_RecoversAfterSettling(t *testing.T) {
+	v, _ := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) {
+		if call < transientRetries {
+			return &provider.CommandResult{Status: "Success", Stdout: ""}, nil
+		}
+		return &provider.CommandResult{Status: "Success", Stdout: "ESSOS\\khal.drogo\n" + sqlProbeSentinel}, nil
+	})
+
+	rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT m.name ...", nil)
+	if !ok {
+		t.Fatal("probe should recover once the host settles, got ok=false")
+	}
+	if rows != `ESSOS\khal.drogo` {
+		t.Errorf("expected row %q, got %q", `ESSOS\khal.drogo`, rows)
+	}
+}
+
+// A transport error (slow/dead host) must NOT trigger the outer retry loop —
+// runPSErr already retried it, and re-running multiplies latency against a
+// host that is timing out. An unknown host makes runPSErr return immediately
+// without ever reaching RunCommand, so the probe must give up after one pass.
+func TestMSSQLProbe_TransportErrorDoesNotAmplify(t *testing.T) {
+	v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) {
+		t.Fatal("RunCommand must not be reached for an unknown host")
+		return nil, nil
+	})
+
+	rows, ok := v.mssqlProbe(context.Background(), "GHOST", "SELECT 1", nil)
+	if ok {
+		t.Fatal("transport error must return ok=false")
+	}
+	if rows != "" {
+		t.Errorf("expected empty rows, got %q", rows)
+	}
+	if got := prov.n.Load(); got != 0 {
+		t.Errorf("RunCommand should never run for unknown host, got %d calls", got)
+	}
+}
+
+// runPSErr is the central fix: a generic probe whose output is empty-but-
+// successful (a settling host) must be retried so checks across every category
+// — not just MSSQL — stop reading a healthy lab as broken. A definitive result
+// returns immediately without retrying.
+func TestRunPSErr_RetriesEmptySuccessfulOutput(t *testing.T) {
+	v, prov := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) {
+		if call < transientRetries {
+			return &provider.CommandResult{Status: "Success", Stdout: "   "}, nil
+		}
+		return &provider.CommandResult{Status: "Success", Stdout: "USER_FOUND"}, nil
+	})
+
+	out, err := v.runPSErr(context.Background(), "SRV03", "Get-ADUser ...")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if strings.TrimSpace(out) != "USER_FOUND" {
+		t.Errorf("expected recovered output, got %q", out)
+	}
+	if n := prov.n.Load(); n != int64(transientRetries) {
+		t.Errorf("expected %d attempts, got %d", transientRetries, n)
+	}
+}
+
+func TestRunPSErr_NonEmptyReturnsImmediately(t *testing.T) {
+	v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: "USER_NOT_FOUND"}, nil
+	})
+
+	out, err := v.runPSErr(context.Background(), "SRV03", "Get-ADUser ...")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if out != "USER_NOT_FOUND" {
+		t.Errorf("expected definitive output, got %q", out)
+	}
+	if n := prov.n.Load(); n != 1 {
+		t.Errorf("a definitive (non-empty) result must not retry, got %d calls", n)
+	}
+}
+
+// runScriptJSON must retry when the JSON envelope is missing (a settling host
+// returns banner-only or empty stdout) and succeed once it appears.
+func TestRunScriptJSON_RetriesUntilEnvelopeAppears(t *testing.T) {
+	type payload struct {
+		X int `json:"x"`
+	}
+	v, prov := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) {
+		if call < transientRetries {
+			return &provider.CommandResult{Status: "Success", Stdout: "WinRM banner, no envelope yet"}, nil
+		}
+		return &provider.CommandResult{Status: "Success", Stdout: jsonBegin + `{"x":7}` + jsonEnd}, nil
+	})
+
+	got, err := runScriptJSON[payload](context.Background(), v, "SRV03", "irrelevant", nil)
+	if err != nil {
+		t.Fatalf("expected success after envelope appears, got %v", err)
+	}
+	if got.X != 7 {
+		t.Errorf("expected x=7, got %d", got.X)
+	}
+	if n := prov.n.Load(); n != int64(transientRetries) {
+		t.Errorf("expected %d attempts, got %d", transientRetries, n)
+	}
+}
diff --git a/cli/internal/validate/script_runner.go b/cli/internal/validate/script_runner.go
index 590568e1..e61547cb 100644
--- a/cli/internal/validate/script_runner.go
+++ b/cli/internal/validate/script_runner.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"strings"
 	"text/template"
+	"time"
 )
 
 // jsonBegin/jsonEnd bracket the JSON payload emitted by every embedded
@@ -18,8 +19,37 @@ import (
 const (
 	jsonBegin = "===BEGIN_JSON==="
 	jsonEnd   = "===END_JSON==="
+
+	// sqlProbeSentinel is appended to every MSSQL probe script and printed
+	// only after the query completes. Its presence distinguishes a genuine
+	// empty result set (sentinel present, zero rows) from a transient blip
+	// (sentinel absent: a host still settling right after provisioning
+	// returns truncated or empty stdout even though the transport reported
+	// success). See [Validator.mssqlProbe].
+	sqlProbeSentinel = "__SQLPROBE_OK__"
+
+	// transientRetries is how many times a probe whose output is empty or
+	// missing its expected JSON envelope / SQL sentinel is re-run before
+	// giving up. Envelope and sentinel scripts never legitimately emit empty
+	// output, so retrying here cannot mask a real "thing not found" result —
+	// it only absorbs the post-provision settling window that would otherwise
+	// turn a healthy lab into bogus failures.
+	transientRetries = 3
 )
 
+var backoffBase = time.Second
+
+// backoffSleep waits attempt*backoffBase or returns ctx.Err() if the context
+// is cancelled first.
+func backoffSleep(ctx context.Context, attempt int) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-time.After(time.Duration(attempt) * backoffBase):
+		return nil
+	}
+}
+
 // scriptFuncs are the template helpers available to embedded PowerShell
 // scripts via text/template.
 var scriptFuncs = template.FuncMap{
@@ -115,17 +145,38 @@ func runScriptJSON[T any](ctx context.Context, v *Validator, host, tmpl string,
 	if err != nil {
 		return zero, err
 	}
-	raw := v.runPS(ctx, host, script)
-	if raw == "" {
-		return zero, errors.New("empty output (host unreachable or marked dead)")
-	}
-	payload, err := extractJSON(raw)
-	if err != nil {
-		return zero, err
-	}
-	var out T
-	if err := json.Unmarshal(payload, &out); err != nil {
-		return zero, fmt.Errorf("unmarshal payload: %w", err)
+	// A non-empty response that lacks the JSON envelope happens transiently for
+	// these scripts (a host still settling emits a banner or partial stream
+	// before the real payload), never as a legitimate result. Retry that case.
+	// The empty-output case is already handled one layer down by runPSErr, and
+	// transport errors there feed the dead-host machinery, so neither is
+	// retried again here.
+	var lastErr error
+	for attempt := 1; attempt <= transientRetries; attempt++ {
+		raw, err := v.runPSErr(ctx, host, script)
+		if err != nil {
+			return zero, err
+		}
+		if raw == "" {
+			return zero, errors.New("empty output despite successful transport (host settling?)")
+		}
+		payload, perr := extractJSON(raw)
+		if perr != nil {
+			lastErr = perr
+			if attempt < transientRetries {
+				if berr := backoffSleep(ctx, attempt); berr != nil {
+					return zero, berr
+				}
+			}
+			continue
+		}
+		var out T
+		if uerr := json.Unmarshal(payload, &out); uerr != nil {
+			// A malformed envelope is a script bug, not a transient blip —
+			// fail fast rather than retrying pointlessly.
+			return zero, fmt.Errorf("unmarshal payload: %w", uerr)
+		}
+		return out, nil
 	}
-	return out, nil
+	return zero, lastErr
 }
diff --git a/cli/internal/validate/validator.go b/cli/internal/validate/validator.go
index a8a0e056..3fdb4b29 100644
--- a/cli/internal/validate/validator.go
+++ b/cli/internal/validate/validator.go
@@ -322,6 +322,40 @@ func (v *Validator) runPSErr(ctx context.Context, host, command string) (string,
 		return "", fmt.Errorf("host %s marked dead for this run", host)
 	}
 
+	// Absorb empty-but-successful responses: a host still settling right after
+	// provisioning returns blank stdout even though the call itself succeeded,
+	// which probes otherwise read as a real "thing not found" and report as a
+	// bogus failure. Retry such responses (without counting them as transport
+	// failures — the host is up), with backoff. A genuinely empty result
+	// converges to the same blank value, so this only costs a little latency
+	// in the rare case where empty is the real answer. Transport *errors* are
+	// not retried here; runPSTransport already handles those and feeds the
+	// dead-host machinery, so re-running would only amplify latency on a slow
+	// or dead host.
+	for emptyAttempt := 1; emptyAttempt <= transientRetries; emptyAttempt++ {
+		stdout, err := v.runPSTransport(ctx, instanceID, host, command)
+		if err != nil {
+			return "", err
+		}
+		if strings.TrimSpace(stdout) != "" {
+			return stdout, nil
+		}
+		if emptyAttempt < transientRetries {
+			if berr := backoffSleep(ctx, emptyAttempt); berr != nil {
+				return "", berr
+			}
+		}
+	}
+	// Empty after retries: a genuine empty result (caller decides what that
+	// means). Returned with nil error to preserve the existing contract.
+	return "", nil
+}
+
+// runPSTransport runs command once against instanceID, retrying only transient
+// transport failures (SSM/WinRM hiccups, throttles) and marking the host dead
+// after deadThreshold sustained failures. Empty-but-successful output is
+// returned as-is; runPSErr owns the settling-retry policy for that case.
+func (v *Validator) runPSTransport(ctx context.Context, instanceID, host, command string) (string, error) {
 	var lastErr error
 	for attempt := 1; attempt <= runPSAttempts; attempt++ {
 		result, err := v.provider.RunCommand(ctx, instanceID, command, runPSTimeout)

From 908cb8043101dab44b201bc33d0e0fd0fbd0e289 Mon Sep 17 00:00:00 2001
From: Jayson Grace <jayson.e.grace@gmail.com>
Date: Sun, 21 Jun 2026 20:02:31 -0600
Subject: [PATCH 2/2] refactor: add retryMustExist to reduce false FAILs on
 transient AD/share probes (#3)

**Key Changes:**

- Introduced `retryMustExist` to distinguish transient absences (e.g., DC mid-replication, admin shares briefly de-registering) from genuine configuration defects, reducing false FAILs
- Replaced inline PowerShell strings with named constants (`scriptADUserExists`, `scriptADUserWithGroups`, `scriptAdminShares`) that use `-ErrorAction Stop` to prevent silent false negatives
- Admin share checks now emit a WARN instead of silently missing the transport-error case
- Added four targeted unit tests covering all retry branches of `retryMustExist`

**Added:**

- `probeOutcome` type and constants (`probePositive`, `probeNegative`, `probeIncomplete`) to classify probe results with clear semantics - `checks.go`
- `retryMustExist` helper that retries only `probeNegative` outcomes up to `transientRetries` attempts, returning `probeIncomplete` immediately to avoid amplifying latency against slow or dead hosts - `checks.go`
- Named PowerShell script constants (`scriptADUserExists`, `scriptADUserWithGroups`, `scriptAdminShares`) extracted from inline strings, with `scriptAdminShares` updated to exit non-zero on Server service query failure rather than silently returning empty output - `checks.go`
- Unit tests for all `retryMustExist` branches: positive short-circuits on first call, persistent negative is trusted after all retries, transient negative recovers to positive, and incomplete does not retry - `probe_retry_test.go`

**Changed:**

- AD user existence checks in `checkUsernamePasswordEqual` and `checkConfiguredUsers` now wrap the probe in `retryMustExist`, converting the outcome enum back to PASS/FAIL/WARN results and preserving the probe error for WARN messages
- Admin share enumeration in `checkAdminShares` now uses `retryMustExist` with `scriptAdminShares`, adding a missing `default` WARN branch for transport errors that previously had no result emitted
---
 cli/internal/validate/checks.go           | 162 +++++++++++++++++-----
 cli/internal/validate/probe_retry_test.go |  76 ++++++++++
 2 files changed, 204 insertions(+), 34 deletions(-)

diff --git a/cli/internal/validate/checks.go b/cli/internal/validate/checks.go
index 597b26af..5e8a1563 100644
--- a/cli/internal/validate/checks.go
+++ b/cli/internal/validate/checks.go
@@ -15,6 +15,69 @@ func printHeader(w io.Writer, header string) {
 	_, _ = fmt.Fprintf(w, "\n== %s ==\n", header)
 }
 
+// probeOutcome classifies a "must-exist" probe so retryMustExist can tell a
+// genuine negative from a transient one.
+type probeOutcome int
+
+const (
+	probePositive   probeOutcome = iota // the expected thing is present (PASS-worthy)
+	probeNegative                       // the expected thing is genuinely absent (FAIL-worthy)
+	probeIncomplete                     // the probe could not complete (WARN-worthy)
+)
+
+// retryMustExist re-runs a probe for a condition that should always hold in a
+// correctly provisioned lab, returning as soon as it sees probePositive. Only
+// probeNegative is retried: a must-exist entity reported absent by a probe that
+// *completed* is far more often a transient blip (a DC mid-replication, an
+// admin share momentarily de-registered) than a real defect, and because the
+// probe completed the host is responsive, so re-running is cheap. A
+// probeIncomplete (transport error) is returned immediately — runPSErr has
+// already retried it and it feeds dead-host marking, so retrying again here
+// would only amplify latency against a slow or dead host. An outcome that
+// persists across every attempt is trusted, so a genuine defect still surfaces.
+func (v *Validator) retryMustExist(ctx context.Context, probe func() probeOutcome) probeOutcome {
+	var last probeOutcome
+	for attempt := 1; attempt <= transientRetries; attempt++ {
+		last = probe()
+		if last != probeNegative {
+			return last
+		}
+		if attempt < transientRetries {
+			if backoffSleep(ctx, attempt) != nil {
+				return last
+			}
+		}
+	}
+	return last
+}
+
+// scriptADUserExists checks whether an AD user exists, distinguishing a genuine
+// absence (USER_NOT_FOUND) from a transient directory error. Get-ADUser is run
+// with -ErrorAction Stop so a momentary RPC/timeout failure raises instead of
+// being silently swallowed into a false "not found"; only the AD-specific
+// not-found exception yields USER_NOT_FOUND, while any other error exits
+// non-zero and surfaces to the caller as a probe error (WARN, not FAIL).
+const scriptADUserExists = `try { Get-ADUser -Identity {{psq .Username}} -ErrorAction Stop > $null; 'USER_FOUND' } ` +
+	`catch [Microsoft.ActiveDirectory.Management.ADIdentityNotFoundException] { 'USER_NOT_FOUND' } ` +
+	`catch { exit 1 }`
+
+// scriptADUserWithGroups is scriptADUserExists plus the user's group
+// memberships (one GROUP=<dn> line each) for membership assertions.
+const scriptADUserWithGroups = `try { $u = Get-ADUser -Identity {{psq .Username}} -Properties MemberOf -ErrorAction Stop } ` +
+	`catch [Microsoft.ActiveDirectory.Management.ADIdentityNotFoundException] { Write-Output 'USER_NOT_FOUND'; exit 0 } ` +
+	`catch { exit 1 }; ` +
+	`Write-Output 'USER_FOUND'; ` +
+	`foreach ($g in $u.MemberOf) { Write-Output "GROUP=$g" }`
+
+// scriptAdminShares enumerates all SMB shares (failing loudly if the Server
+// service can't be queried) and emits the admin shares that are present. A
+// query error exits non-zero (WARN); a successful enumeration that is missing
+// a share is authoritative for that moment and is retried by the caller, since
+// ADMIN$/C$ can briefly de-register while the Server service settles.
+const scriptAdminShares = `$ErrorActionPreference='Stop'; ` +
+	`try { $s = Get-SmbShare } catch { exit 1 }; ` +
+	`$s | Where-Object { $_.Name -eq 'ADMIN$' -or $_.Name -eq 'C$' } | Select-Object -ExpandProperty Name`
+
 func (v *Validator) checkCredentialDiscovery(ctx context.Context, w io.Writer) {
 	printHeader(w, "Credential Discovery Vulnerabilities")
 
@@ -1542,22 +1605,32 @@ func (v *Validator) checkUsernamePasswordEqual(ctx context.Context, w io.Writer)
 		if !v.hasHost(dcRole) {
 			continue
 		}
-		output, err := runScriptText(ctx, v, dcRole,
-			`$u = Get-ADUser -Identity {{psq .Username}} -ErrorAction SilentlyContinue; if ($u) { 'USER_FOUND' } else { 'USER_NOT_FOUND' }`,
-			map[string]any{"Username": uf.Username})
-		switch {
-		case err != nil:
-			v.addResult(w, "WARN", "Credentials",
-				fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, err), "")
-		case strings.Contains(output, "USER_FOUND"):
+		var output string
+		var probeErr error
+		outcome := v.retryMustExist(ctx, func() probeOutcome {
+			output, probeErr = runScriptText(ctx, v, dcRole, scriptADUserExists,
+				map[string]any{"Username": uf.Username})
+			switch {
+			case probeErr != nil:
+				return probeIncomplete
+			case strings.Contains(output, "USER_FOUND"):
+				return probePositive
+			case strings.Contains(output, "USER_NOT_FOUND"):
+				return probeNegative
+			default:
+				return probeIncomplete
+			}
+		})
+		switch outcome {
+		case probePositive:
 			v.addResult(w, "PASS", "Credentials",
 				fmt.Sprintf("%s (password=%s) exists in %s", uf.Username, uf.User.Password, uf.Domain), "")
-		case strings.Contains(output, "USER_NOT_FOUND"):
+		case probeNegative:
 			v.addResult(w, "FAIL", "Credentials",
 				fmt.Sprintf("%s does NOT exist in %s (expected weak-cred user)", uf.Username, uf.Domain), "")
 		default:
 			v.addResult(w, "WARN", "Credentials",
-				fmt.Sprintf("Could not verify %s in %s", uf.Username, uf.Domain), "")
+				fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, probeErr), "")
 		}
 	}
 }
@@ -2766,18 +2839,28 @@ func (v *Validator) checkConfiguredUsers(ctx context.Context, w io.Writer) {
 		if !v.hasHost(dcRole) {
 			continue
 		}
-		output, err := runScriptText(ctx, v, dcRole,
-			`$u = Get-ADUser -Identity {{psq .Username}} -Properties MemberOf -ErrorAction SilentlyContinue; `+
-				`if (-not $u) { Write-Output 'USER_NOT_FOUND'; exit }; `+
-				`Write-Output 'USER_FOUND'; `+
-				`foreach ($g in $u.MemberOf) { Write-Output "GROUP=$g" }`,
-			map[string]any{"Username": uf.Username})
-		if err != nil {
+		var output string
+		var probeErr error
+		outcome := v.retryMustExist(ctx, func() probeOutcome {
+			output, probeErr = runScriptText(ctx, v, dcRole, scriptADUserWithGroups,
+				map[string]any{"Username": uf.Username})
+			switch {
+			case probeErr != nil:
+				return probeIncomplete
+			case strings.Contains(output, "USER_FOUND"):
+				return probePositive
+			case strings.Contains(output, "USER_NOT_FOUND"):
+				return probeNegative
+			default:
+				return probeIncomplete
+			}
+		})
+		switch outcome {
+		case probeIncomplete:
 			v.addResult(w, "WARN", "Users",
-				fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, err), "")
+				fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, probeErr), "")
 			continue
-		}
-		if !strings.Contains(output, "USER_FOUND") {
+		case probeNegative:
 			v.addResult(w, "FAIL", "Users",
 				fmt.Sprintf("%s does NOT exist in %s", uf.Username, uf.Domain), "")
 			continue
@@ -3033,26 +3116,37 @@ func (v *Validator) checkAdminShares(ctx context.Context, w io.Writer) {
 		}
 		hostLabel := strings.ToUpper(v.lab.Hostname(role))
 
-		output := v.runPS(ctx, host,
-			`Get-SmbShare -Name ADMIN$,C$ -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Name`)
-		shares := parseOutputLines(output)
-		found := make(map[string]bool, len(shares))
-		for _, s := range shares {
-			found[strings.ToUpper(strings.TrimSpace(s))] = true
-		}
-
 		var missing []string
-		for _, want := range []string{"ADMIN$", "C$"} {
-			if !found[want] {
-				missing = append(missing, want)
+		outcome := v.retryMustExist(ctx, func() probeOutcome {
+			output, err := runScriptText(ctx, v, host, scriptAdminShares, nil)
+			if err != nil {
+				return probeIncomplete
 			}
-		}
-		if len(missing) == 0 {
+			found := make(map[string]bool)
+			for _, s := range parseOutputLines(output) {
+				found[strings.ToUpper(strings.TrimSpace(s))] = true
+			}
+			missing = nil
+			for _, want := range []string{"ADMIN$", "C$"} {
+				if !found[want] {
+					missing = append(missing, want)
+				}
+			}
+			if len(missing) == 0 {
+				return probePositive
+			}
+			return probeNegative
+		})
+		switch outcome {
+		case probePositive:
 			v.addResult(w, "PASS", "AdminShares",
 				fmt.Sprintf("%s exposes ADMIN$ and C$", hostLabel), "")
-		} else {
+		case probeNegative:
 			v.addResult(w, "FAIL", "AdminShares",
 				fmt.Sprintf("%s missing default shares: %s", hostLabel, strings.Join(missing, ", ")), "")
+		default:
+			v.addResult(w, "WARN", "AdminShares",
+				fmt.Sprintf("Could not enumerate shares on %s (host settling?)", hostLabel), "")
 		}
 	}
 }
diff --git a/cli/internal/validate/probe_retry_test.go b/cli/internal/validate/probe_retry_test.go
index e949bdc9..c558667a 100644
--- a/cli/internal/validate/probe_retry_test.go
+++ b/cli/internal/validate/probe_retry_test.go
@@ -188,6 +188,82 @@ func TestRunPSErr_NonEmptyReturnsImmediately(t *testing.T) {
 	}
 }
 
+// retryMustExist must return immediately on a positive result without retrying.
+func TestRetryMustExist_PositiveShortCircuits(t *testing.T) {
+	v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil
+	})
+	calls := 0
+	out := v.retryMustExist(context.Background(), func() probeOutcome {
+		calls++
+		return probePositive
+	})
+	if out != probePositive {
+		t.Fatalf("expected probePositive, got %d", out)
+	}
+	if calls != 1 {
+		t.Errorf("positive must not retry, got %d calls", calls)
+	}
+}
+
+// A must-exist entity reported absent is retried; a negative that persists
+// across every attempt is ultimately trusted (a genuine misconfiguration).
+func TestRetryMustExist_NegativeRetriedThenTrusted(t *testing.T) {
+	v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil
+	})
+	calls := 0
+	out := v.retryMustExist(context.Background(), func() probeOutcome {
+		calls++
+		return probeNegative
+	})
+	if out != probeNegative {
+		t.Fatalf("expected probeNegative after retries, got %d", out)
+	}
+	if calls != transientRetries {
+		t.Errorf("expected %d attempts, got %d", transientRetries, calls)
+	}
+}
+
+// A momentary negative that clears (DC mid-replication, share re-registering)
+// must recover to positive rather than reporting a false FAIL.
+func TestRetryMustExist_RecoversFromTransientNegative(t *testing.T) {
+	v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil
+	})
+	calls := 0
+	out := v.retryMustExist(context.Background(), func() probeOutcome {
+		calls++
+		if calls < transientRetries {
+			return probeNegative
+		}
+		return probePositive
+	})
+	if out != probePositive {
+		t.Fatalf("expected recovery to probePositive, got %d", out)
+	}
+}
+
+// A probeIncomplete (transport error) must NOT be retried — runPSErr already
+// retried it and it feeds dead-host marking, so re-running would amplify
+// latency against a slow or dead host.
+func TestRetryMustExist_IncompleteDoesNotRetry(t *testing.T) {
+	v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) {
+		return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil
+	})
+	calls := 0
+	out := v.retryMustExist(context.Background(), func() probeOutcome {
+		calls++
+		return probeIncomplete
+	})
+	if out != probeIncomplete {
+		t.Fatalf("expected probeIncomplete, got %d", out)
+	}
+	if calls != 1 {
+		t.Errorf("incomplete must not retry (avoid amplifying a dead host), got %d calls", calls)
+	}
+}
+
 // runScriptJSON must retry when the JSON envelope is missing (a settling host
 // returns banner-only or empty stdout) and succeed once it appears.
 func TestRunScriptJSON_RetriesUntilEnvelopeAppears(t *testing.T) {