From b4cfef1c5bfe82d3a8ee92159602dcf0c991d690 Mon Sep 17 00:00:00 2001 From: Jayson Grace Date: Sun, 21 Jun 2026 16:46:27 -0600 Subject: [PATCH 1/2] fix: distinguish transient empty output from genuine empty MSSQL results (#2) **Key Changes:** - Introduced `mssqlProbe` with a sentinel-based protocol to reliably separate a genuine empty SQL result set from a host still settling after provisioning - Added retry logic in `runPSErr` and `runScriptJSON` to absorb transient empty-but-successful responses without masking real "not found" results - Upgraded all MSSQL check call sites to handle the new `(string, bool)` return so probes that cannot complete emit WARN instead of a bogus FAIL - Added comprehensive unit tests covering the sentinel, retry, recovery, and transport-error paths **Added:** - `mssqlProbe` method - new dedicated SQL probe that appends `sqlProbeSentinel` to every query script and uses its presence to distinguish a completed-but-empty result (`ok=true, rows=""`) from a probe that never finished (`ok=false`); callers WARN rather than FAIL on the latter - `sqlProbeSentinel` and `transientRetries` constants plus `backoffBase` variable and `backoffSleep` helper in `script_runner.go` to centralise retry policy and allow tests to shrink backoff to milliseconds - `runPSTransport` method extracted from `runPSErr` to own transport-level retries and dead-host tracking, keeping the two concerns separate - `probe_retry_test.go` - seven focused tests covering: genuine empty is definitive (no retry), transient empty retries then warns, mid-probe recovery, transport error does not amplify retries, `runPSErr` retry behaviour, non-empty returns immediately, and `runScriptJSON` envelope retry **Changed:** - `runPSErr` now wraps `runPSTransport` and adds an outer loop that retries empty-but-successful responses up to `transientRetries` times with backoff before returning a blank result with `nil` error, preserving the existing contract for callers - `runScriptJSON` replaced the single `runPS` call with a retry loop over `runPSErr` that re-runs when the JSON envelope is absent (a transient settling signature) and fails fast only on malformed envelopes, which indicate a script bug rather than a transient blip - `mssqlQueryFn` type signature updated from `func(...) string` to `func(...) (string, bool)` to propagate probe completion status through `checkMSSQLExtendedFeatures` - All three MSSQL check loops (sysadmins, `EXECUTE AS LOGIN`, linked servers) and the `xp_cmdshell`/`TRUSTWORTHY` checks now use `switch`/`if !ok` guards to emit WARN with a `(host settling?)` hint when a probe does not complete, instead of silently treating the empty string as a definitive negative --- cli/internal/validate/checks.go | 96 +++++++--- cli/internal/validate/probe_retry_test.go | 214 ++++++++++++++++++++++ cli/internal/validate/script_runner.go | 75 ++++++-- cli/internal/validate/validator.go | 34 ++++ 4 files changed, 378 insertions(+), 41 deletions(-) create mode 100644 cli/internal/validate/probe_retry_test.go diff --git a/cli/internal/validate/checks.go b/cli/internal/validate/checks.go index 4efe5270..597b26af 100644 --- a/cli/internal/validate/checks.go +++ b/cli/internal/validate/checks.go @@ -320,54 +320,54 @@ func (v *Validator) checkMSSQL(ctx context.Context, w io.Writer) { } v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL running on %s", hostLabel), "") - sqlQuery := func(sqlTmpl string, vars map[string]any) string { - out, err := runScriptText(ctx, v, host, - `$c = New-Object System.Data.SqlClient.SqlConnection 'Server=localhost;Integrated Security=True;TrustServerCertificate=True'; `+ - `$c.Open(); $cmd = $c.CreateCommand(); `+ - `$cmd.CommandText = @"`+"\n"+sqlTmpl+"\n"+`"@; `+ - `$r = $cmd.ExecuteReader(); while ($r.Read()) { Write-Output $r[0].ToString() }; $r.Close(); $c.Close()`, - vars) - if err != nil { - return "" - } - return out + sqlQuery := func(sqlTmpl string, vars map[string]any) (string, bool) { + return v.mssqlProbe(ctx, host, sqlTmpl, vars) } for _, admin := range mf.MSSQL.SysAdmins { - output = sqlQuery( + rows, ok := sqlQuery( `SELECT m.name FROM sys.server_role_members srm `+ `JOIN sys.server_principals r ON srm.role_principal_id = r.principal_id `+ `JOIN sys.server_principals m ON srm.member_principal_id = m.principal_id `+ `WHERE r.name = 'sysadmin' AND m.name = {{psq .Admin}}`, map[string]any{"Admin": admin}) - if output != "" { + switch { + case !ok: + v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine sysadmin status of %s on %s (host settling?)", admin, hostLabel), "") + case rows != "": v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("%s is sysadmin on %s", admin, hostLabel), "") - } else { + default: v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("%s is NOT sysadmin on %s", admin, hostLabel), "") } } for grantee, target := range mf.MSSQL.ExecuteAsLogin { - output = sqlQuery( + rows, ok := sqlQuery( `SELECT pr.name FROM sys.server_permissions sp `+ `JOIN sys.server_principals pr ON sp.grantee_principal_id = pr.principal_id `+ `JOIN sys.server_principals pr2 ON sp.major_id = pr2.principal_id `+ `WHERE sp.permission_name = 'IMPERSONATE' AND pr.name = {{psq .Grantee}} AND pr2.name = {{psq .Target}}`, map[string]any{"Grantee": grantee, "Target": target}) - if output != "" { + switch { + case !ok: + v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine IMPERSONATE grant for %s->%s on %s (host settling?)", grantee, target, hostLabel), "") + case rows != "": v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("%s can impersonate %s on %s", grantee, target, hostLabel), "") - } else { + default: v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("%s CANNOT impersonate %s on %s", grantee, target, hostLabel), "") } } for name, ls := range mf.MSSQL.LinkedServers { - output = sqlQuery( + rows, ok := sqlQuery( `SELECT name FROM sys.servers WHERE is_linked = 1 AND name = {{psq .Name}}`, map[string]any{"Name": name}) - if output != "" { + switch { + case !ok: + v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not determine linked server %s on %s (host settling?)", name, hostLabel), "") + case rows != "": v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("Linked server %s -> %s on %s", name, ls.DataSrc, hostLabel), "") - } else { + default: v.addResult(w, "FAIL", "MSSQL", fmt.Sprintf("Linked server %s NOT found on %s", name, hostLabel), "") } } @@ -376,13 +376,47 @@ func (v *Validator) checkMSSQL(ctx context.Context, w io.Writer) { } } -type mssqlQueryFn func(sqlTmpl string, vars map[string]any) string +// mssqlProbe runs a read-only SQL query on host over a local integrated-auth +// connection and returns the rows (one per line). ok is false only when the +// probe could not be completed after retries — a host still settling right +// after provisioning returns empty or truncated stdout even though the WinRM +// transport reports success. The sqlProbeSentinel, printed only after the +// query finishes, separates that transient case from a genuine empty result +// set: a completed query with no rows returns ("", true). Callers should WARN +// (not FAIL) when ok is false so a healthy lab is never reported as broken. +func (v *Validator) mssqlProbe(ctx context.Context, host, sqlTmpl string, vars map[string]any) (string, bool) { + script := `$c = New-Object System.Data.SqlClient.SqlConnection 'Server=localhost;Integrated Security=True;TrustServerCertificate=True'; ` + + `$c.Open(); $cmd = $c.CreateCommand(); ` + + `$cmd.CommandText = @"` + "\n" + sqlTmpl + "\n" + `"@; ` + + `$r = $cmd.ExecuteReader(); while ($r.Read()) { Write-Output $r[0].ToString() }; $r.Close(); $c.Close(); ` + + `Write-Output '` + sqlProbeSentinel + `'` + // runPSErr already retries empty-but-successful output (a settling host), + // so a single call suffices: a completed query always prints the sentinel + // — even with zero rows — so its presence means the result is authoritative + // (empty rows = a genuine negative). Its absence means the probe never + // completed (transport error, or empty output that outlived the retries): + // report ok=false so the caller WARNs instead of emitting a bogus FAIL. + out, err := runScriptText(ctx, v, host, script, vars) + if err != nil || !strings.Contains(out, sqlProbeSentinel) { + return "", false + } + return strings.TrimSpace(strings.Replace(out, sqlProbeSentinel, "", 1)), true +} + +type mssqlQueryFn func(sqlTmpl string, vars map[string]any) (string, bool) func (v *Validator) checkMSSQLExtendedFeatures(w io.Writer, sqlQuery mssqlQueryFn, hostLabel string) { - output := sqlQuery( + xpOut, ok := sqlQuery( `SELECT CONVERT(INT, ISNULL(value, value_in_use)) FROM sys.configurations WHERE name = 'xp_cmdshell'`, nil) - xpEnabled := strings.TrimSpace(output) == "1" + if !ok { + // Couldn't get a definitive answer; the SeImpersonate probe below + // depends on xp_cmdshell, so skip the whole group rather than emit a + // bogus "NOT enabled". + v.addResult(w, "WARN", "MSSQL", fmt.Sprintf("Could not query xp_cmdshell on %s (host settling?)", hostLabel), "") + return + } + xpEnabled := strings.TrimSpace(xpOut) == "1" if xpEnabled { v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("xp_cmdshell enabled on %s", hostLabel), "") } else { @@ -390,17 +424,21 @@ func (v *Validator) checkMSSQLExtendedFeatures(w io.Writer, sqlQuery mssqlQueryF } if xpEnabled { - privOut := sqlQuery(`EXEC xp_cmdshell 'whoami /priv'`, nil) - if strings.Contains(privOut, "SeImpersonatePrivilege") { - v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL service has SeImpersonatePrivilege on %s (potato escalation possible)", hostLabel), "") - } else if strings.TrimSpace(privOut) != "" { - v.addResult(w, "INFO", "MSSQL", fmt.Sprintf("SeImpersonatePrivilege NOT found on MSSQL service on %s", hostLabel), "") + if privOut, ok := sqlQuery(`EXEC xp_cmdshell 'whoami /priv'`, nil); ok { + if strings.Contains(privOut, "SeImpersonatePrivilege") { + v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("MSSQL service has SeImpersonatePrivilege on %s (potato escalation possible)", hostLabel), "") + } else if strings.TrimSpace(privOut) != "" { + v.addResult(w, "INFO", "MSSQL", fmt.Sprintf("SeImpersonatePrivilege NOT found on MSSQL service on %s", hostLabel), "") + } } } - trustworthy := sqlQuery( + trustworthy, ok := sqlQuery( `SELECT name FROM sys.databases WHERE is_trustworthy_on = 1 AND name NOT IN ('master','tempdb')`, nil) + if !ok { + return + } dbs := parseOutputLines(trustworthy) if len(dbs) > 0 { v.addResult(w, "PASS", "MSSQL", fmt.Sprintf("TRUSTWORTHY databases on %s: %s", hostLabel, strings.Join(dbs, ", ")), "") diff --git a/cli/internal/validate/probe_retry_test.go b/cli/internal/validate/probe_retry_test.go new file mode 100644 index 00000000..e949bdc9 --- /dev/null +++ b/cli/internal/validate/probe_retry_test.go @@ -0,0 +1,214 @@ +package validate + +import ( + "context" + "io" + "log/slog" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/dreadnode/dreadgoad/internal/provider" +) + +// stubProvider implements provider.Provider but only RunCommand is live; the +// rest are no-ops sufficient to satisfy the interface for unit tests. +type stubProvider struct { + run func(call int, command string) (*provider.CommandResult, error) + n atomic.Int64 +} + +func (s *stubProvider) RunCommand(_ context.Context, _, command string, _ time.Duration) (*provider.CommandResult, error) { + call := int(s.n.Add(1)) + return s.run(call, command) +} + +func (s *stubProvider) Name() string { return "stub" } +func (s *stubProvider) VerifyCredentials(context.Context) (string, error) { return "stub", nil } +func (s *stubProvider) DiscoverInstances(context.Context, string) ([]provider.Instance, error) { + return nil, nil +} +func (s *stubProvider) DiscoverAllInstances(context.Context, string) ([]provider.Instance, error) { + return nil, nil +} +func (s *stubProvider) FindInstanceByHostname(context.Context, string, string) (*provider.Instance, error) { + return nil, nil +} +func (s *stubProvider) StartInstances(context.Context, []string) error { return nil } +func (s *stubProvider) StopInstances(context.Context, []string) error { return nil } +func (s *stubProvider) WaitForInstanceStopped(context.Context, string) error { return nil } +func (s *stubProvider) DestroyInstances(context.Context, []string) error { return nil } +func (s *stubProvider) RunCommandOnMultiple(context.Context, []string, string, time.Duration) (map[string]*provider.CommandResult, error) { + return nil, nil +} +func (s *stubProvider) CleanupStaleSessions(context.Context, []string, time.Duration, bool) (int, error) { + return 0, nil +} +func (s *stubProvider) DescribeActiveSessions(context.Context, string) ([]provider.Session, error) { + return nil, nil +} +func (s *stubProvider) Drain() {} + +func newStubValidator(t *testing.T, run func(call int, command string) (*provider.CommandResult, error)) (*Validator, *stubProvider) { + t.Helper() + // Shrink the retry backoff so retry paths don't sleep whole seconds. + old := backoffBase + backoffBase = time.Millisecond + t.Cleanup(func() { backoffBase = old }) + + prov := &stubProvider{run: run} + v := &Validator{ + provider: prov, + log: slog.New(slog.NewTextHandler(io.Discard, nil)), + hosts: map[string]string{"SRV03": "i-123"}, + } + return v, prov +} + +// A completed query with zero rows must be reported as a genuine negative +// (ok=true, empty rows) on the very first call — no retries, no false WARN. +func TestMSSQLProbe_GenuineEmptyIsDefinitive(t *testing.T) { + v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) { + // Query ran, returned no rows, but the sentinel still prints. + return &provider.CommandResult{Status: "Success", Stdout: sqlProbeSentinel + "\n"}, nil + }) + + rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT 1 WHERE 1=0", nil) + if !ok { + t.Fatal("genuine empty result must return ok=true (definitive), got ok=false") + } + if rows != "" { + t.Errorf("expected empty rows, got %q", rows) + } + if got := prov.n.Load(); got != 1 { + t.Errorf("expected exactly 1 call (no retry on definitive result), got %d", got) + } +} + +// Empty stdout (sentinel absent) is the post-provision settling signature: the +// probe must retry and, if it never settles, return ok=false so the caller +// WARNs instead of emitting a bogus "NOT configured" FAIL. +func TestMSSQLProbe_TransientEmptyRetriesThenWarns(t *testing.T) { + v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: ""}, nil + }) + + rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT 1", nil) + if ok { + t.Fatal("sentinel-absent output must return ok=false, got ok=true") + } + if rows != "" { + t.Errorf("expected empty rows on transient failure, got %q", rows) + } + if got := prov.n.Load(); got != int64(transientRetries) { + t.Errorf("expected %d attempts, got %d", transientRetries, got) + } +} + +// A host that settles mid-probe (empty, then a real row with the sentinel) +// must recover and report the real result with ok=true. +func TestMSSQLProbe_RecoversAfterSettling(t *testing.T) { + v, _ := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) { + if call < transientRetries { + return &provider.CommandResult{Status: "Success", Stdout: ""}, nil + } + return &provider.CommandResult{Status: "Success", Stdout: "ESSOS\\khal.drogo\n" + sqlProbeSentinel}, nil + }) + + rows, ok := v.mssqlProbe(context.Background(), "SRV03", "SELECT m.name ...", nil) + if !ok { + t.Fatal("probe should recover once the host settles, got ok=false") + } + if rows != `ESSOS\khal.drogo` { + t.Errorf("expected row %q, got %q", `ESSOS\khal.drogo`, rows) + } +} + +// A transport error (slow/dead host) must NOT trigger the outer retry loop — +// runPSErr already retried it, and re-running multiplies latency against a +// host that is timing out. An unknown host makes runPSErr return immediately +// without ever reaching RunCommand, so the probe must give up after one pass. +func TestMSSQLProbe_TransportErrorDoesNotAmplify(t *testing.T) { + v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) { + t.Fatal("RunCommand must not be reached for an unknown host") + return nil, nil + }) + + rows, ok := v.mssqlProbe(context.Background(), "GHOST", "SELECT 1", nil) + if ok { + t.Fatal("transport error must return ok=false") + } + if rows != "" { + t.Errorf("expected empty rows, got %q", rows) + } + if got := prov.n.Load(); got != 0 { + t.Errorf("RunCommand should never run for unknown host, got %d calls", got) + } +} + +// runPSErr is the central fix: a generic probe whose output is empty-but- +// successful (a settling host) must be retried so checks across every category +// — not just MSSQL — stop reading a healthy lab as broken. A definitive result +// returns immediately without retrying. +func TestRunPSErr_RetriesEmptySuccessfulOutput(t *testing.T) { + v, prov := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) { + if call < transientRetries { + return &provider.CommandResult{Status: "Success", Stdout: " "}, nil + } + return &provider.CommandResult{Status: "Success", Stdout: "USER_FOUND"}, nil + }) + + out, err := v.runPSErr(context.Background(), "SRV03", "Get-ADUser ...") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if strings.TrimSpace(out) != "USER_FOUND" { + t.Errorf("expected recovered output, got %q", out) + } + if n := prov.n.Load(); n != int64(transientRetries) { + t.Errorf("expected %d attempts, got %d", transientRetries, n) + } +} + +func TestRunPSErr_NonEmptyReturnsImmediately(t *testing.T) { + v, prov := newStubValidator(t, func(_ int, _ string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: "USER_NOT_FOUND"}, nil + }) + + out, err := v.runPSErr(context.Background(), "SRV03", "Get-ADUser ...") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if out != "USER_NOT_FOUND" { + t.Errorf("expected definitive output, got %q", out) + } + if n := prov.n.Load(); n != 1 { + t.Errorf("a definitive (non-empty) result must not retry, got %d calls", n) + } +} + +// runScriptJSON must retry when the JSON envelope is missing (a settling host +// returns banner-only or empty stdout) and succeed once it appears. +func TestRunScriptJSON_RetriesUntilEnvelopeAppears(t *testing.T) { + type payload struct { + X int `json:"x"` + } + v, prov := newStubValidator(t, func(call int, _ string) (*provider.CommandResult, error) { + if call < transientRetries { + return &provider.CommandResult{Status: "Success", Stdout: "WinRM banner, no envelope yet"}, nil + } + return &provider.CommandResult{Status: "Success", Stdout: jsonBegin + `{"x":7}` + jsonEnd}, nil + }) + + got, err := runScriptJSON[payload](context.Background(), v, "SRV03", "irrelevant", nil) + if err != nil { + t.Fatalf("expected success after envelope appears, got %v", err) + } + if got.X != 7 { + t.Errorf("expected x=7, got %d", got.X) + } + if n := prov.n.Load(); n != int64(transientRetries) { + t.Errorf("expected %d attempts, got %d", transientRetries, n) + } +} diff --git a/cli/internal/validate/script_runner.go b/cli/internal/validate/script_runner.go index 590568e1..e61547cb 100644 --- a/cli/internal/validate/script_runner.go +++ b/cli/internal/validate/script_runner.go @@ -8,6 +8,7 @@ import ( "fmt" "strings" "text/template" + "time" ) // jsonBegin/jsonEnd bracket the JSON payload emitted by every embedded @@ -18,8 +19,37 @@ import ( const ( jsonBegin = "===BEGIN_JSON===" jsonEnd = "===END_JSON===" + + // sqlProbeSentinel is appended to every MSSQL probe script and printed + // only after the query completes. Its presence distinguishes a genuine + // empty result set (sentinel present, zero rows) from a transient blip + // (sentinel absent: a host still settling right after provisioning + // returns truncated or empty stdout even though the transport reported + // success). See [Validator.mssqlProbe]. + sqlProbeSentinel = "__SQLPROBE_OK__" + + // transientRetries is how many times a probe whose output is empty or + // missing its expected JSON envelope / SQL sentinel is re-run before + // giving up. Envelope and sentinel scripts never legitimately emit empty + // output, so retrying here cannot mask a real "thing not found" result — + // it only absorbs the post-provision settling window that would otherwise + // turn a healthy lab into bogus failures. + transientRetries = 3 ) +var backoffBase = time.Second + +// backoffSleep waits attempt*backoffBase or returns ctx.Err() if the context +// is cancelled first. +func backoffSleep(ctx context.Context, attempt int) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(time.Duration(attempt) * backoffBase): + return nil + } +} + // scriptFuncs are the template helpers available to embedded PowerShell // scripts via text/template. var scriptFuncs = template.FuncMap{ @@ -115,17 +145,38 @@ func runScriptJSON[T any](ctx context.Context, v *Validator, host, tmpl string, if err != nil { return zero, err } - raw := v.runPS(ctx, host, script) - if raw == "" { - return zero, errors.New("empty output (host unreachable or marked dead)") - } - payload, err := extractJSON(raw) - if err != nil { - return zero, err - } - var out T - if err := json.Unmarshal(payload, &out); err != nil { - return zero, fmt.Errorf("unmarshal payload: %w", err) + // A non-empty response that lacks the JSON envelope happens transiently for + // these scripts (a host still settling emits a banner or partial stream + // before the real payload), never as a legitimate result. Retry that case. + // The empty-output case is already handled one layer down by runPSErr, and + // transport errors there feed the dead-host machinery, so neither is + // retried again here. + var lastErr error + for attempt := 1; attempt <= transientRetries; attempt++ { + raw, err := v.runPSErr(ctx, host, script) + if err != nil { + return zero, err + } + if raw == "" { + return zero, errors.New("empty output despite successful transport (host settling?)") + } + payload, perr := extractJSON(raw) + if perr != nil { + lastErr = perr + if attempt < transientRetries { + if berr := backoffSleep(ctx, attempt); berr != nil { + return zero, berr + } + } + continue + } + var out T + if uerr := json.Unmarshal(payload, &out); uerr != nil { + // A malformed envelope is a script bug, not a transient blip — + // fail fast rather than retrying pointlessly. + return zero, fmt.Errorf("unmarshal payload: %w", uerr) + } + return out, nil } - return out, nil + return zero, lastErr } diff --git a/cli/internal/validate/validator.go b/cli/internal/validate/validator.go index a8a0e056..3fdb4b29 100644 --- a/cli/internal/validate/validator.go +++ b/cli/internal/validate/validator.go @@ -322,6 +322,40 @@ func (v *Validator) runPSErr(ctx context.Context, host, command string) (string, return "", fmt.Errorf("host %s marked dead for this run", host) } + // Absorb empty-but-successful responses: a host still settling right after + // provisioning returns blank stdout even though the call itself succeeded, + // which probes otherwise read as a real "thing not found" and report as a + // bogus failure. Retry such responses (without counting them as transport + // failures — the host is up), with backoff. A genuinely empty result + // converges to the same blank value, so this only costs a little latency + // in the rare case where empty is the real answer. Transport *errors* are + // not retried here; runPSTransport already handles those and feeds the + // dead-host machinery, so re-running would only amplify latency on a slow + // or dead host. + for emptyAttempt := 1; emptyAttempt <= transientRetries; emptyAttempt++ { + stdout, err := v.runPSTransport(ctx, instanceID, host, command) + if err != nil { + return "", err + } + if strings.TrimSpace(stdout) != "" { + return stdout, nil + } + if emptyAttempt < transientRetries { + if berr := backoffSleep(ctx, emptyAttempt); berr != nil { + return "", berr + } + } + } + // Empty after retries: a genuine empty result (caller decides what that + // means). Returned with nil error to preserve the existing contract. + return "", nil +} + +// runPSTransport runs command once against instanceID, retrying only transient +// transport failures (SSM/WinRM hiccups, throttles) and marking the host dead +// after deadThreshold sustained failures. Empty-but-successful output is +// returned as-is; runPSErr owns the settling-retry policy for that case. +func (v *Validator) runPSTransport(ctx context.Context, instanceID, host, command string) (string, error) { var lastErr error for attempt := 1; attempt <= runPSAttempts; attempt++ { result, err := v.provider.RunCommand(ctx, instanceID, command, runPSTimeout) From 908cb8043101dab44b201bc33d0e0fd0fbd0e289 Mon Sep 17 00:00:00 2001 From: Jayson Grace Date: Sun, 21 Jun 2026 20:02:31 -0600 Subject: [PATCH 2/2] refactor: add retryMustExist to reduce false FAILs on transient AD/share probes (#3) **Key Changes:** - Introduced `retryMustExist` to distinguish transient absences (e.g., DC mid-replication, admin shares briefly de-registering) from genuine configuration defects, reducing false FAILs - Replaced inline PowerShell strings with named constants (`scriptADUserExists`, `scriptADUserWithGroups`, `scriptAdminShares`) that use `-ErrorAction Stop` to prevent silent false negatives - Admin share checks now emit a WARN instead of silently missing the transport-error case - Added four targeted unit tests covering all retry branches of `retryMustExist` **Added:** - `probeOutcome` type and constants (`probePositive`, `probeNegative`, `probeIncomplete`) to classify probe results with clear semantics - `checks.go` - `retryMustExist` helper that retries only `probeNegative` outcomes up to `transientRetries` attempts, returning `probeIncomplete` immediately to avoid amplifying latency against slow or dead hosts - `checks.go` - Named PowerShell script constants (`scriptADUserExists`, `scriptADUserWithGroups`, `scriptAdminShares`) extracted from inline strings, with `scriptAdminShares` updated to exit non-zero on Server service query failure rather than silently returning empty output - `checks.go` - Unit tests for all `retryMustExist` branches: positive short-circuits on first call, persistent negative is trusted after all retries, transient negative recovers to positive, and incomplete does not retry - `probe_retry_test.go` **Changed:** - AD user existence checks in `checkUsernamePasswordEqual` and `checkConfiguredUsers` now wrap the probe in `retryMustExist`, converting the outcome enum back to PASS/FAIL/WARN results and preserving the probe error for WARN messages - Admin share enumeration in `checkAdminShares` now uses `retryMustExist` with `scriptAdminShares`, adding a missing `default` WARN branch for transport errors that previously had no result emitted --- cli/internal/validate/checks.go | 162 +++++++++++++++++----- cli/internal/validate/probe_retry_test.go | 76 ++++++++++ 2 files changed, 204 insertions(+), 34 deletions(-) diff --git a/cli/internal/validate/checks.go b/cli/internal/validate/checks.go index 597b26af..5e8a1563 100644 --- a/cli/internal/validate/checks.go +++ b/cli/internal/validate/checks.go @@ -15,6 +15,69 @@ func printHeader(w io.Writer, header string) { _, _ = fmt.Fprintf(w, "\n== %s ==\n", header) } +// probeOutcome classifies a "must-exist" probe so retryMustExist can tell a +// genuine negative from a transient one. +type probeOutcome int + +const ( + probePositive probeOutcome = iota // the expected thing is present (PASS-worthy) + probeNegative // the expected thing is genuinely absent (FAIL-worthy) + probeIncomplete // the probe could not complete (WARN-worthy) +) + +// retryMustExist re-runs a probe for a condition that should always hold in a +// correctly provisioned lab, returning as soon as it sees probePositive. Only +// probeNegative is retried: a must-exist entity reported absent by a probe that +// *completed* is far more often a transient blip (a DC mid-replication, an +// admin share momentarily de-registered) than a real defect, and because the +// probe completed the host is responsive, so re-running is cheap. A +// probeIncomplete (transport error) is returned immediately — runPSErr has +// already retried it and it feeds dead-host marking, so retrying again here +// would only amplify latency against a slow or dead host. An outcome that +// persists across every attempt is trusted, so a genuine defect still surfaces. +func (v *Validator) retryMustExist(ctx context.Context, probe func() probeOutcome) probeOutcome { + var last probeOutcome + for attempt := 1; attempt <= transientRetries; attempt++ { + last = probe() + if last != probeNegative { + return last + } + if attempt < transientRetries { + if backoffSleep(ctx, attempt) != nil { + return last + } + } + } + return last +} + +// scriptADUserExists checks whether an AD user exists, distinguishing a genuine +// absence (USER_NOT_FOUND) from a transient directory error. Get-ADUser is run +// with -ErrorAction Stop so a momentary RPC/timeout failure raises instead of +// being silently swallowed into a false "not found"; only the AD-specific +// not-found exception yields USER_NOT_FOUND, while any other error exits +// non-zero and surfaces to the caller as a probe error (WARN, not FAIL). +const scriptADUserExists = `try { Get-ADUser -Identity {{psq .Username}} -ErrorAction Stop > $null; 'USER_FOUND' } ` + + `catch [Microsoft.ActiveDirectory.Management.ADIdentityNotFoundException] { 'USER_NOT_FOUND' } ` + + `catch { exit 1 }` + +// scriptADUserWithGroups is scriptADUserExists plus the user's group +// memberships (one GROUP= line each) for membership assertions. +const scriptADUserWithGroups = `try { $u = Get-ADUser -Identity {{psq .Username}} -Properties MemberOf -ErrorAction Stop } ` + + `catch [Microsoft.ActiveDirectory.Management.ADIdentityNotFoundException] { Write-Output 'USER_NOT_FOUND'; exit 0 } ` + + `catch { exit 1 }; ` + + `Write-Output 'USER_FOUND'; ` + + `foreach ($g in $u.MemberOf) { Write-Output "GROUP=$g" }` + +// scriptAdminShares enumerates all SMB shares (failing loudly if the Server +// service can't be queried) and emits the admin shares that are present. A +// query error exits non-zero (WARN); a successful enumeration that is missing +// a share is authoritative for that moment and is retried by the caller, since +// ADMIN$/C$ can briefly de-register while the Server service settles. +const scriptAdminShares = `$ErrorActionPreference='Stop'; ` + + `try { $s = Get-SmbShare } catch { exit 1 }; ` + + `$s | Where-Object { $_.Name -eq 'ADMIN$' -or $_.Name -eq 'C$' } | Select-Object -ExpandProperty Name` + func (v *Validator) checkCredentialDiscovery(ctx context.Context, w io.Writer) { printHeader(w, "Credential Discovery Vulnerabilities") @@ -1542,22 +1605,32 @@ func (v *Validator) checkUsernamePasswordEqual(ctx context.Context, w io.Writer) if !v.hasHost(dcRole) { continue } - output, err := runScriptText(ctx, v, dcRole, - `$u = Get-ADUser -Identity {{psq .Username}} -ErrorAction SilentlyContinue; if ($u) { 'USER_FOUND' } else { 'USER_NOT_FOUND' }`, - map[string]any{"Username": uf.Username}) - switch { - case err != nil: - v.addResult(w, "WARN", "Credentials", - fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, err), "") - case strings.Contains(output, "USER_FOUND"): + var output string + var probeErr error + outcome := v.retryMustExist(ctx, func() probeOutcome { + output, probeErr = runScriptText(ctx, v, dcRole, scriptADUserExists, + map[string]any{"Username": uf.Username}) + switch { + case probeErr != nil: + return probeIncomplete + case strings.Contains(output, "USER_FOUND"): + return probePositive + case strings.Contains(output, "USER_NOT_FOUND"): + return probeNegative + default: + return probeIncomplete + } + }) + switch outcome { + case probePositive: v.addResult(w, "PASS", "Credentials", fmt.Sprintf("%s (password=%s) exists in %s", uf.Username, uf.User.Password, uf.Domain), "") - case strings.Contains(output, "USER_NOT_FOUND"): + case probeNegative: v.addResult(w, "FAIL", "Credentials", fmt.Sprintf("%s does NOT exist in %s (expected weak-cred user)", uf.Username, uf.Domain), "") default: v.addResult(w, "WARN", "Credentials", - fmt.Sprintf("Could not verify %s in %s", uf.Username, uf.Domain), "") + fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, probeErr), "") } } } @@ -2766,18 +2839,28 @@ func (v *Validator) checkConfiguredUsers(ctx context.Context, w io.Writer) { if !v.hasHost(dcRole) { continue } - output, err := runScriptText(ctx, v, dcRole, - `$u = Get-ADUser -Identity {{psq .Username}} -Properties MemberOf -ErrorAction SilentlyContinue; `+ - `if (-not $u) { Write-Output 'USER_NOT_FOUND'; exit }; `+ - `Write-Output 'USER_FOUND'; `+ - `foreach ($g in $u.MemberOf) { Write-Output "GROUP=$g" }`, - map[string]any{"Username": uf.Username}) - if err != nil { + var output string + var probeErr error + outcome := v.retryMustExist(ctx, func() probeOutcome { + output, probeErr = runScriptText(ctx, v, dcRole, scriptADUserWithGroups, + map[string]any{"Username": uf.Username}) + switch { + case probeErr != nil: + return probeIncomplete + case strings.Contains(output, "USER_FOUND"): + return probePositive + case strings.Contains(output, "USER_NOT_FOUND"): + return probeNegative + default: + return probeIncomplete + } + }) + switch outcome { + case probeIncomplete: v.addResult(w, "WARN", "Users", - fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, err), "") + fmt.Sprintf("Could not verify %s in %s: %v", uf.Username, uf.Domain, probeErr), "") continue - } - if !strings.Contains(output, "USER_FOUND") { + case probeNegative: v.addResult(w, "FAIL", "Users", fmt.Sprintf("%s does NOT exist in %s", uf.Username, uf.Domain), "") continue @@ -3033,26 +3116,37 @@ func (v *Validator) checkAdminShares(ctx context.Context, w io.Writer) { } hostLabel := strings.ToUpper(v.lab.Hostname(role)) - output := v.runPS(ctx, host, - `Get-SmbShare -Name ADMIN$,C$ -ErrorAction SilentlyContinue | Select-Object -ExpandProperty Name`) - shares := parseOutputLines(output) - found := make(map[string]bool, len(shares)) - for _, s := range shares { - found[strings.ToUpper(strings.TrimSpace(s))] = true - } - var missing []string - for _, want := range []string{"ADMIN$", "C$"} { - if !found[want] { - missing = append(missing, want) + outcome := v.retryMustExist(ctx, func() probeOutcome { + output, err := runScriptText(ctx, v, host, scriptAdminShares, nil) + if err != nil { + return probeIncomplete } - } - if len(missing) == 0 { + found := make(map[string]bool) + for _, s := range parseOutputLines(output) { + found[strings.ToUpper(strings.TrimSpace(s))] = true + } + missing = nil + for _, want := range []string{"ADMIN$", "C$"} { + if !found[want] { + missing = append(missing, want) + } + } + if len(missing) == 0 { + return probePositive + } + return probeNegative + }) + switch outcome { + case probePositive: v.addResult(w, "PASS", "AdminShares", fmt.Sprintf("%s exposes ADMIN$ and C$", hostLabel), "") - } else { + case probeNegative: v.addResult(w, "FAIL", "AdminShares", fmt.Sprintf("%s missing default shares: %s", hostLabel, strings.Join(missing, ", ")), "") + default: + v.addResult(w, "WARN", "AdminShares", + fmt.Sprintf("Could not enumerate shares on %s (host settling?)", hostLabel), "") } } } diff --git a/cli/internal/validate/probe_retry_test.go b/cli/internal/validate/probe_retry_test.go index e949bdc9..c558667a 100644 --- a/cli/internal/validate/probe_retry_test.go +++ b/cli/internal/validate/probe_retry_test.go @@ -188,6 +188,82 @@ func TestRunPSErr_NonEmptyReturnsImmediately(t *testing.T) { } } +// retryMustExist must return immediately on a positive result without retrying. +func TestRetryMustExist_PositiveShortCircuits(t *testing.T) { + v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil + }) + calls := 0 + out := v.retryMustExist(context.Background(), func() probeOutcome { + calls++ + return probePositive + }) + if out != probePositive { + t.Fatalf("expected probePositive, got %d", out) + } + if calls != 1 { + t.Errorf("positive must not retry, got %d calls", calls) + } +} + +// A must-exist entity reported absent is retried; a negative that persists +// across every attempt is ultimately trusted (a genuine misconfiguration). +func TestRetryMustExist_NegativeRetriedThenTrusted(t *testing.T) { + v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil + }) + calls := 0 + out := v.retryMustExist(context.Background(), func() probeOutcome { + calls++ + return probeNegative + }) + if out != probeNegative { + t.Fatalf("expected probeNegative after retries, got %d", out) + } + if calls != transientRetries { + t.Errorf("expected %d attempts, got %d", transientRetries, calls) + } +} + +// A momentary negative that clears (DC mid-replication, share re-registering) +// must recover to positive rather than reporting a false FAIL. +func TestRetryMustExist_RecoversFromTransientNegative(t *testing.T) { + v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil + }) + calls := 0 + out := v.retryMustExist(context.Background(), func() probeOutcome { + calls++ + if calls < transientRetries { + return probeNegative + } + return probePositive + }) + if out != probePositive { + t.Fatalf("expected recovery to probePositive, got %d", out) + } +} + +// A probeIncomplete (transport error) must NOT be retried — runPSErr already +// retried it and it feeds dead-host marking, so re-running would amplify +// latency against a slow or dead host. +func TestRetryMustExist_IncompleteDoesNotRetry(t *testing.T) { + v, _ := newStubValidator(t, func(int, string) (*provider.CommandResult, error) { + return &provider.CommandResult{Status: "Success", Stdout: "x"}, nil + }) + calls := 0 + out := v.retryMustExist(context.Background(), func() probeOutcome { + calls++ + return probeIncomplete + }) + if out != probeIncomplete { + t.Fatalf("expected probeIncomplete, got %d", out) + } + if calls != 1 { + t.Errorf("incomplete must not retry (avoid amplifying a dead host), got %d calls", calls) + } +} + // runScriptJSON must retry when the JSON envelope is missing (a settling host // returns banner-only or empty stdout) and succeed once it appears. func TestRunScriptJSON_RetriesUntilEnvelopeAppears(t *testing.T) {