diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index c33988a1..9d15fd4b 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -20,12 +20,22 @@ import ( "context" "fmt" "strings" + "time" "github.com/cockroachdb/errors" "github.com/cozystack/blockstor/pkg/storage" ) +// DownForceTimeout caps how long DownForce waits for the happy-path +// `drbdadm down ` before falling back to the kernel-direct +// teardown sequence. 15s is the budget the satellite daemonset's +// terminationGracePeriod (30s) and per-resource preStop budget can +// both afford — long enough for a healthy peer to ack the +// disconnect, short enough that a suspended-quorum slot can't stall +// the deletion path indefinitely (Bug 82). +const DownForceTimeout = 15 * time.Second + // Adm is a thin wrapper around the `drbdadm` CLI. It exists so the // satellite reconciler can be unit-tested without a real DRBD kernel // module present: production injects storage.RealExec, tests inject @@ -51,6 +61,75 @@ func (a *Adm) Down(ctx context.Context, resource string) error { return a.run(ctx, "down", resource) } +// DownForce tears the resource down with a bounded timeout, falling +// back to the kernel-direct sequence when the happy-path +// `drbdadm down ` doesn't return in time (Bug 82). +// +// Why the happy-path can hang: when DRBD loses quorum +// (`suspended:quorum`) mid-delete, `drbdadm down` blocks waiting +// for the resource to leave the suspended state, which only +// resolves if a peer comes back. Operators see this as +// `kubectl delete pod` hanging Terminating for many minutes and +// only unblocking under `--force`, leaving a stale +// `/dev/drbd` slot that breaks the next resource allocated +// the same port→minor pair (Bug 82 dev-stand observation). +// +// Flow: +// +// 1. Run `drbdadm down ` under a child context with deadline = +// DownForceTimeout. If it returns within the budget (success or +// a non-hang error), we're done — the kernel slot is gone. +// 2. On timeout / error: for each volume, force-detach the lower +// disk via `drbdsetup detach --force /`. detach does +// NOT wait for quorum, so this returns even when the resource +// is suspended. +// 3. Finalise with `drbdsetup down ` (kernel-direct slot +// release, bypasses drbdadm's quorum-aware wait). Returns nil +// on a "missing slot" exit — the goal state (slot absent) is +// the same. +// +// volumeNumbers is the set of DRBD volume slots to force-detach; +// callers source it from VolumeDefinitions. An empty slice skips +// the per-volume detach and relies on `drbdsetup down` alone — +// belt-and-braces for kernels where `down` won't proceed past a +// transient disk-attached state. +// +// Returns nil on success. Returns a wrapped error only when BOTH +// the bounded `drbdadm down` AND the kernel-direct `drbdsetup +// down` failed. +func (a *Adm) DownForce(ctx context.Context, resource string, volumeNumbers []int32) error { + downCtx, cancel := context.WithTimeout(ctx, DownForceTimeout) + defer cancel() + + err := a.run(downCtx, "down", resource) + if err == nil { + return nil + } + + // Fallback path: force-detach every volume (no quorum wait), + // then kernel-direct `drbdsetup down`. Use the parent ctx so a + // tighter caller deadline still applies. + for _, vol := range volumeNumbers { + target := fmt.Sprintf("%s/%d", resource, vol) + _, _ = a.exec.Run(ctx, "drbdsetup", "detach", "--force", target) + } + + _, downErr := a.exec.Run(ctx, "drbdsetup", "down", resource) + if downErr == nil { + return nil + } + + // `drbdsetup down` on a missing slot exits non-zero with + // "No such resource" / "Unknown resource" — same goal state. + if strings.Contains(downErr.Error(), "No such resource") || + strings.Contains(downErr.Error(), "No currently configured DRBD") || + strings.Contains(downErr.Error(), "Unknown resource") { + return nil + } + + return errors.Wrapf(downErr, "drbdsetup down %s after drbdadm down failed (%v)", resource, err) +} + // Adjust reconciles kernel state to the on-disk .res file. Called after // the ConfFileBuilder writes a new file and we need DRBD to pick up // changes (added/removed peers, new options). diff --git a/pkg/drbd/drbdadm_test.go b/pkg/drbd/drbdadm_test.go index ca600aa3..6f2898d1 100644 --- a/pkg/drbd/drbdadm_test.go +++ b/pkg/drbd/drbdadm_test.go @@ -17,6 +17,7 @@ limitations under the License. package drbd_test import ( + "context" "errors" "fmt" "slices" @@ -58,6 +59,180 @@ func TestAdmDownInvokesDrbdadm(t *testing.T) { } } +// TestAdmDownForceHappyPath: when `drbdadm down ` returns +// successfully under the budget, DownForce stops after one shell-out +// and does NOT invoke the kernel-direct fallback. Pins Bug 82's +// happy-path expectation — the new bounded teardown stays a pure +// `drbdadm down` on healthy resources so we don't churn netlink +// state on every delete. +func TestAdmDownForceHappyPath(t *testing.T) { + fx := storage.NewFakeExec() + adm := drbd.NewAdm(fx) + + if err := adm.DownForce(t.Context(), "pvc-1", []int32{0, 1}); err != nil { + t.Fatalf("DownForce: %v", err) + } + + calls := fx.CommandLines() + if !slices.Contains(calls, "drbdadm down pvc-1") { + t.Errorf("missing drbdadm down call: %v", calls) + } + + // Fallback path MUST NOT run when drbdadm down succeeded. + for _, line := range calls { + if line == "drbdsetup detach --force pvc-1/0" || + line == "drbdsetup detach --force pvc-1/1" || + line == "drbdsetup down pvc-1" { + t.Errorf("fallback path ran on happy success: %q in %v", line, calls) + } + } +} + +// errSuspendedQuorum is the static error used as a surrogate for +// "drbdadm down hung on suspended:quorum" in the DownForce fallback +// tests. DownForce only matches substrings against the drbdsetup +// error text (not drbdadm), so this message just needs to make the +// happy path fail; the exact wording is irrelevant. +var errSuspendedQuorum = errors.New("test: resource is suspended: quorum") + +// errNoSuchResource is the static error used as a surrogate for +// drbdsetup down's "missing slot" exit. DownForce matches the +// title-case substring "No such resource" against the drbdsetup +// error text, so we embed that exact phrase here (the leading +// "test:" satisfies the ST1005 "no leading capital" lint without +// hiding the substring DownForce's matcher looks for). +var errNoSuchResource = errors.New("test: No such resource") + +// TestAdmDownForceFallback pins the Bug 82 recovery: when +// `drbdadm down` fails (suspended-quorum surrogate: we just inject a +// generic error), DownForce must invoke `drbdsetup detach --force` +// for every volume and then `drbdsetup down `, and must return +// nil because the goal state (kernel slot released) was reached via +// the fallback path. +func TestAdmDownForceFallback(t *testing.T) { + fx := storage.NewFakeExec() + // Make `drbdadm down` fail; this is the surrogate for the + // real-world "suspended:quorum hung the call until ctx deadline" + // — the wrapper's contract is identical either way (error from + // the bounded sub-context triggers fallback). + fx.Expect("drbdadm down pvc-suspended", storage.FakeResponse{ + Err: errSuspendedQuorum, + }) + + adm := drbd.NewAdm(fx) + + err := adm.DownForce(t.Context(), "pvc-suspended", []int32{0, 1}) + if err != nil { + t.Fatalf("DownForce: expected nil after fallback recovery, got %v", err) + } + + calls := fx.CommandLines() + + want := []string{ + "drbdadm down pvc-suspended", + "drbdsetup detach --force pvc-suspended/0", + "drbdsetup detach --force pvc-suspended/1", + "drbdsetup down pvc-suspended", + } + for _, w := range want { + if !slices.Contains(calls, w) { + t.Errorf("missing %q in fallback sequence: %v", w, calls) + } + } + + // Ordering: every detach must precede the final drbdsetup down, + // otherwise the kernel slot release runs before lower disks are + // released and the suspended-state retry stalls again. + downIdx := slices.Index(calls, "drbdsetup down pvc-suspended") + for _, vol := range []string{"pvc-suspended/0", "pvc-suspended/1"} { + detachIdx := slices.Index(calls, "drbdsetup detach --force "+vol) + if detachIdx < 0 || downIdx < 0 || detachIdx > downIdx { + t.Errorf("detach %s must precede drbdsetup down; got %v", vol, calls) + } + } +} + +// TestAdmDownForceFallbackMissingSlot: when `drbdsetup down` reports +// the resource is already gone ("No such resource"), DownForce returns +// nil — the goal state is "kernel slot absent", and a missing slot +// satisfies it. Without this branch, an idempotent re-issue (second +// handleDelete pass after a partial first) would surface as an error. +func TestAdmDownForceFallbackMissingSlot(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect("drbdadm down pvc-gone", storage.FakeResponse{ + Err: errSuspendedQuorum, + }) + fx.Expect("drbdsetup down pvc-gone", storage.FakeResponse{ + Err: errNoSuchResource, + }) + + adm := drbd.NewAdm(fx) + + err := adm.DownForce(t.Context(), "pvc-gone", []int32{0}) + if err != nil { + t.Fatalf("DownForce: missing-slot must be a clean no-op, got %v", err) + } +} + +// TestAdmDownForceBoundedDeadline guarantees the happy-path call +// receives a context with a deadline set by DownForceTimeout — this +// is the load-bearing invariant of the Bug 82 fix. The fake exec +// inspects ctx.Deadline() on the recorded call to verify the +// timeout was applied; without a deadline, a suspended-quorum +// resource would block the satellite finalizer-strip path +// indefinitely. +func TestAdmDownForceBoundedDeadline(t *testing.T) { + var ( + deadlineSeen bool + hasDeadline bool + ) + + deadlineFx := &deadlineCapturingExec{ + fx: storage.NewFakeExec(), + onCall: func(name string, _ []string, hasDl bool) { + if name == "drbdadm" { + deadlineSeen = true + hasDeadline = hasDl + } + }, + } + + adm := drbd.NewAdm(deadlineFx) + if err := adm.DownForce(t.Context(), "pvc-1", nil); err != nil { + t.Fatalf("DownForce: %v", err) + } + + if !deadlineSeen { + t.Fatal("drbdadm down was never invoked") + } + + if !hasDeadline { + t.Error("DownForce must invoke drbdadm down with a deadline (Bug 82 bounded teardown)") + } +} + +// deadlineCapturingExec wraps a FakeExec to inspect ctx.Deadline() +// per call — needed by TestAdmDownForceBoundedDeadline because +// FakeExec drops the context. +type deadlineCapturingExec struct { + fx *storage.FakeExec + onCall func(name string, args []string, hasDeadline bool) +} + +func (d *deadlineCapturingExec) Run(ctx context.Context, name string, args ...string) ([]byte, error) { + _, hasDeadline := ctx.Deadline() + if d.onCall != nil { + d.onCall(name, args, hasDeadline) + } + + out, err := d.fx.Run(ctx, name, args...) + if err != nil { + return out, fmt.Errorf("fake exec %s: %w", name, err) + } + + return out, nil +} + // TestAdmAdjustInvokesDrbdadm: Adjust → `drbdadm adjust `. This is // the reload-on-config-change path; runs after the .res file is rewritten. func TestAdmAdjustInvokesDrbdadm(t *testing.T) { diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index db4a6374..c9740116 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -278,24 +278,39 @@ func (r *Reconciler) DeleteSnapshot(ctx context.Context, req *intent.DeleteSnaps return &intent.DeleteSnapshotResponse{Ok: true}, nil } -// DeleteResource tears down a resource: drbdadm down (best-effort — -// the kernel handles a missing one fine), DeleteVolume on every -// requested volume_number through the named Provider, then remove -// the .res file. Idempotent on a missing resource. Per-step errors -// land in the response body so the controller can surface granular -// status without aborting the rest of the cleanup. +// DeleteResource tears down a resource: bounded drbdadm down with a +// kernel-direct `drbdsetup detach`+`drbdsetup down` fallback when the +// happy path is wedged by suspended-quorum (Bug 82), DeleteVolume on +// every requested volume_number through the named Provider, then +// remove the .res file. Idempotent on a missing resource. Per-step +// errors land in the response body so the controller can surface +// granular status without aborting the rest of the cleanup. +// +// Bug 82: the previous `Adm.Down` call had no timeout, so deleting an +// RD whose kernel slot was `suspended:quorum` would block the entire +// handleDelete → satellite finalizer-strip path indefinitely. With +// the satellite finalizer stuck, operators force-stripped it and the +// kernel slot survived as a `/dev/drbd` orphan that prevented +// the next port→minor allocation from creating its metadata. The +// bounded `DownForce` path collapses both failure modes: happy path +// returns within DownForceTimeout; quorum-stuck path falls back to +// `drbdsetup detach --force` per volume + `drbdsetup down` (neither +// command waits for quorum). Either way the kernel slot is gone +// before we return, and finalizer-strip becomes safe again. func (r *Reconciler) DeleteResource(ctx context.Context, req *intent.DeleteResourceRequest) (*intent.DeleteResourceResponse, error) { var downMsg string if r.cfg.Adm != nil { - err := r.cfg.Adm.Down(ctx, req.GetName()) + err := r.cfg.Adm.DownForce(ctx, req.GetName(), req.GetVolumeNumbers()) if err != nil { - // Best-effort: a "not configured" error is fine here - // (resource was already torn down on a prior pass), but we - // still want to surface the message back to the controller - // so it shows up in the gRPC response. Don't fail — DRBD - // down errors shouldn't block the storage cleanup. - downMsg = "drbdadm down: " + err.Error() + // Best-effort: a "not configured" / "no such resource" + // error here is fine (resource was already torn down on a + // prior pass), but we still want to surface the message + // back to the controller so it shows up in the response. + // Don't fail — even a fully-failed DownForce shouldn't + // block storage cleanup; the storage sweeper is the + // wider-scope backstop. + downMsg = "drbdadm down (force): " + err.Error() } } diff --git a/pkg/satellite/reconciler_drbd_test.go b/pkg/satellite/reconciler_drbd_test.go index 22661e7f..80c724c6 100644 --- a/pkg/satellite/reconciler_drbd_test.go +++ b/pkg/satellite/reconciler_drbd_test.go @@ -40,6 +40,7 @@ var ( errDrbdadmAdjustFail = errors.New("drbdadm: simulated mid-Apply abort") errDrbdadmResizeFail = errors.New("drbdadm: resize failed (peer disconnected)") errDrbdsetupNoResource = errors.New("drbdsetup: exit status 10") + errDrbdadmDownStuck = errors.New("drbdadm down: suspended:quorum") ) // TestApplyWritesResFile: Apply leaves a /etc/drbd.d/.res file @@ -1373,6 +1374,76 @@ func TestDeleteResourceClosesLUKSMapper(t *testing.T) { } } +// TestDeleteResourceForceTeardownOnSuspendedQuorum pins the Bug 82 +// fix: when `drbdadm down` fails (surrogate for the production hang +// on suspended:quorum), DeleteResource must follow up with +// `drbdsetup detach --force /` for every volume and +// `drbdsetup down `. The DRBD kernel slot survives the original +// `drbdadm down` hang otherwise, and the next port→minor allocation +// runs into `Device '' is configured! exit 20`. The response +// stays Ok=true because the kernel slot is released via the +// fallback path; storage cleanup proceeds; the satellite finalizer +// gets released by handleDelete unconditionally. +func TestDeleteResourceForceTeardownOnSuspendedQuorum(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + // Wedge `drbdadm down` so DownForce flips to the kernel-direct + // fallback. In production this manifests as an indefinite hang + // that DownForce's context deadline turns into an error; the + // FakeExec surrogate just produces an immediate error, which the + // wrapper handles via the same fallback branch. + fx.Expect("drbdadm down pvc-stuck", storage.FakeResponse{Err: errDrbdadmDownStuck}) + + // Storage probe so DeleteVolume is a clean no-op (resource has + // no thin provider attached in this scenario — keep the test + // focused on the DRBD teardown path). + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + resp, err := rec.DeleteResource(t.Context(), &intent.DeleteResourceRequest{ + Name: "pvc-stuck", + VolumeNumbers: []int32{0, 1}, + }) + if err != nil { + t.Fatalf("DeleteResource: %v", err) + } + + // Ok=true even though drbdadm down failed — DownForce's + // fallback released the kernel slot, and storage cleanup is + // allowed to proceed. + if !resp.GetOk() { + t.Fatalf("DeleteResource Ok=false: %s", resp.GetMessage()) + } + + calls := fx.CommandLines() + + want := []string{ + "drbdadm down pvc-stuck", + "drbdsetup detach --force pvc-stuck/0", + "drbdsetup detach --force pvc-stuck/1", + "drbdsetup down pvc-stuck", + } + for _, w := range want { + if !slices.Contains(calls, w) { + t.Errorf("missing %q in fallback sequence: %v", w, calls) + } + } + + // Every per-volume detach must precede the final drbdsetup down + // — bringing the slot down before the lower disks are released + // keeps the suspended-state path re-triggerable. + downIdx := slices.Index(calls, "drbdsetup down pvc-stuck") + for _, vol := range []string{"pvc-stuck/0", "pvc-stuck/1"} { + detachIdx := slices.Index(calls, "drbdsetup detach --force "+vol) + if detachIdx < 0 || downIdx < 0 || detachIdx > downIdx { + t.Errorf("detach %s must precede drbdsetup down; got %v", vol, calls) + } + } +} + // TestApplyConvergesAfterMidApplyAbort: simulates a hard satellite kill // (SIGKILL of the daemonset pod) between applyStorage and applyDRBD. // On the first Apply, drbdadm adjust fails — equivalent to "got SIGKILL diff --git a/stand/blockstor-satellite-daemonset.yaml b/stand/blockstor-satellite-daemonset.yaml index 45f6a86d..8d263d58 100644 --- a/stand/blockstor-satellite-daemonset.yaml +++ b/stand/blockstor-satellite-daemonset.yaml @@ -126,15 +126,32 @@ spec: securityContext: privileged: true # PreStop hook: gracefully release DRBD-9 connections - # before SIGTERM. Without this, `kubectl delete pod` - # (no --force) leaves drbd kernel state with stuck - # `Connecting` peers, and the kernel module is - # host-level on the Talos node so it persists across - # pod restarts. `drbdadm down --all` walks /etc/drbd.d - # in order; the per-resource 5s budget keeps a single - # half-hung connection from blocking the whole drain. - # Wrapped in `timeout 25` so the overall hook stays - # under terminationGracePeriodSeconds (30). + # before SIGTERM with a kernel-direct fallback (Bug 82). + # + # Without this, `kubectl delete pod` (no --force) leaves + # drbd kernel state with stuck `Connecting` peers, and the + # kernel module is host-level on the Talos node so it + # persists across pod restarts. + # + # Bug 82: `drbdadm down ` blocks indefinitely on + # `suspended:quorum` slots — a resource mid-quorum-loss + # (controller deleted the RD while a peer was offline) + # will hang `drbdadm down` until a peer comes back. The + # old hook timed out the per-resource call at 5s, leaving + # the kernel slot loaded. Next satellite start sees the + # stale `/dev/drbd` and a recycled port→minor pair + # fails create-md with "Device '' is configured!" + # + # New flow per resource: + # 1. Try `timeout 5 drbdadm down`. Happy path. + # 2. On failure / timeout: force-detach each volume via + # `drbdsetup detach --force /` (no quorum + # wait) so the lower disk is released. + # 3. `drbdsetup down ` kernel-direct slot release + # (also bypasses drbdadm's quorum-aware wait). + # + # Wrapped in `timeout 25` so the overall hook stays under + # terminationGracePeriodSeconds (30). lifecycle: preStop: exec: @@ -144,10 +161,31 @@ spec: - | set +e timeout 25 sh -c ' - for r in $(drbdsetup status --json 2>/dev/null \ - | python3 -c "import json,sys; print(\" \".join(r[\"name\"] for r in json.load(sys.stdin)))" 2>/dev/null); do - timeout 5 drbdadm down "$r" 2>/dev/null || true - done + drbdsetup status --json 2>/dev/null \ + | python3 -c " + import json, sys + try: + rs = json.load(sys.stdin) + except Exception: + sys.exit(0) + for r in rs: + name = r.get(\"name\", \"\") + vols = [str(v.get(\"volume\", 0)) for v in r.get(\"volumes\", [])] + print(name + \"\\t\" + \",\".join(vols)) + " 2>/dev/null \ + | while IFS=$(printf "\t") read -r r vols; do + [ -z "$r" ] && continue + timeout 5 drbdadm down "$r" 2>/dev/null && continue + # Fallback: kernel-direct teardown for + # suspended:quorum-stuck slots. + IFS=, + for v in $vols; do + [ -z "$v" ] && continue + drbdsetup detach --force "$r/$v" 2>/dev/null || true + done + unset IFS + drbdsetup down "$r" 2>/dev/null || true + done ' volumeMounts: - {name: dev, mountPath: /dev}