From c460c03bd986d5de4617d3f9dc30ed42a55ccd5a Mon Sep 17 00:00:00 2001 From: Dongdong Tao Date: Fri, 5 Jun 2026 23:37:26 +0900 Subject: [PATCH 1/4] refresh-dwarf: runtime-verify each generated JSON via cephadm The refresh bot previously only proved a generated JSON parses and links into the embedded header -- never that the tools actually trace meaningfully through it. Add a parallel per-version runtime verification: - Split the workflow into three jobs: generate -> verify (matrix) -> open-pr. - verify fans out one runner per generated version, rebuilds osdtrace + radostrace with the new JSON embedded, provisions a single-host cephadm cluster on quay.io/ceph/ceph:v (whose ceph-osd build_id matches the el9 RPM the JSON was extracted from), drives an S3 workload, and traces a live OSD + radosgw through the EMBEDDED path. - open-pr includes only versions that passed; failures are dropped from the PR and listed for retry next run. functional-test-cephadm-rgw.sh gains two opt-in knobs (existing matrix behaviour unchanged when unset): - CEPH_IMAGE: pin an exact point-release image instead of the per-major latest. - REQUIRE_EMBEDDED=1: make the 'Using embedded DWARF data' marker mandatory, so a silent fall-back to live DWARF parsing fails the test. --- .github/workflows/refresh-embedded-dwarf.yaml | 263 +++++++++++++++++- tests/functional-test-cephadm-rgw.sh | 45 ++- 2 files changed, 292 insertions(+), 16 deletions(-) diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml index 4952523..093ca46 100644 --- a/.github/workflows/refresh-embedded-dwarf.yaml +++ b/.github/workflows/refresh-embedded-dwarf.yaml @@ -3,8 +3,24 @@ name: Refresh embedded DWARF for new Ceph releases # Phase 1 (this file): centos-stream / el9 only. 
Detects newly-published # Ceph point releases (quincy / reef / squid / tentacle) on # download.ceph.com, generates osdtrace + radostrace DWARF JSONs for the -# missing ones inside a disposable centos:stream9 podman container, and -# opens a follow-up PR with the new files. +# missing ones inside a disposable centos:stream9 podman container, +# runtime-verifies each one against a real cephadm cluster of the matching +# version, and opens a follow-up PR with only the verified files. +# +# Three jobs: +# generate - detect + generate JSONs (one podman container per version), +# re-aggregate the embedded header as an early link gate, and +# publish the new JSONs + manifest as artifacts. Emits a JSON +# array of the generated versions for the verify matrix. +# verify - dynamic matrix, one runner per generated version (parallel). +# Each cell rebuilds osdtrace/radostrace with the new JSONs +# embedded, provisions a single-host cephadm cluster running +# quay.io/ceph/ceph:v<version> (whose build_id matches the el9 +# RPM the JSON came from), drives an S3 workload, and traces a +# real OSD + radosgw via the EMBEDDED path (REQUIRE_EMBEDDED=1 +# -> a fallback to live DWARF parsing is a failure). +# open-pr - assemble the verified subset (drop any version whose runtime +# verification failed; list it for retry) and open the PR. # # Phases 2-3 (future): mirror the same detect/generate/PR flow for # quay.io container-image build-ids, then for Ubuntu / Cloud Archive / @@ -24,12 +40,18 @@ permissions: pull-requests: write jobs: - refresh: + generate: runs-on: ubuntu-24.04 # Worst case: 15 missing versions * ~6 min/version = 90 min for the # generators alone, plus ~5 min for the host build + ~5 min for the - # final rebuild + PR open. 120 min leaves headroom for slow downloads. + # final rebuild. The 180 min timeout leaves headroom for slow downloads. 
timeout-minutes: 180 + outputs: + count: ${{ steps.detect.outputs.count }} + succeeded: ${{ steps.generate.outputs.succeeded }} + # JSON array of generated versions, e.g. ["17.2.7","19.2.2"]; drives + # the verify job's dynamic matrix. "[]" when nothing was generated. + versions: ${{ steps.generate.outputs.versions }} steps: - name: Checkout code and submodules @@ -89,6 +111,12 @@ jobs: echo "succeeded=$S" >> "$GITHUB_OUTPUT" echo "failed=$F" >> "$GITHUB_OUTPUT" + # Emit the generated versions as a JSON array for the verify matrix. + versions=$(awk -F'\t' 'NF>=3{print $3}' /tmp/succeeded.tsv \ + | python3 -c "import sys,json; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))") + echo "versions=$versions" >> "$GITHUB_OUTPUT" + echo "versions=$versions" + - name: Re-aggregate embedded DWARF header + relink # This step proves the new JSONs parse cleanly through # tools/generate_embedded_dwarf.py and that osdtrace + radostrace @@ -100,43 +128,240 @@ jobs: make clean make -j"$(nproc)" osdtrace radostrace - - name: Compose pull-request body + - name: Stage generated JSONs + manifest for downstream jobs + if: steps.generate.outputs.succeeded != '0' + run: | + mkdir -p /tmp/artifacts/jsons/osdtrace /tmp/artifacts/jsons/radostrace + while IFS=$'\t' read -r distro tools version pkgver; do + for f in "files/centos-stream/osdtrace/osd-${pkgver}_dwarf.json" \ + "files/centos-stream/radostrace/rados-${pkgver}_dwarf.json"; do + [ -f "$f" ] && cp "$f" "/tmp/artifacts/jsons/${f#files/centos-stream/}" + done + done < /tmp/succeeded.tsv + mkdir -p /tmp/artifacts/manifest + cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/manifest/ 2>/dev/null || true + find /tmp/artifacts -type f | sort + + - name: Upload generated JSONs + if: steps.generate.outputs.succeeded != '0' + uses: actions/upload-artifact@v4 + with: + name: generated-jsons + path: /tmp/artifacts/jsons/ + retention-days: 7 + + - name: Upload generation manifest if: 
steps.generate.outputs.succeeded != '0' + uses: actions/upload-artifact@v4 + with: + name: gen-manifest + path: /tmp/artifacts/manifest/ + retention-days: 7 + + verify: + needs: generate + # Skip entirely when nothing was generated (empty matrix is invalid). + if: ${{ needs.generate.outputs.versions != '' && needs.generate.outputs.versions != '[]' }} + runs-on: ubuntu-24.04 + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + version: ${{ fromJson(needs.generate.outputs.versions) }} + steps: + - name: Checkout code and submodules + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Install build dependencies + run: | + sudo apt-get update + sudo apt-get install -y g++ clang libelf-dev libc6-dev-i386 libdw-dev python3 + + - name: Pull in the freshly generated JSONs + uses: actions/download-artifact@v4 + with: + name: generated-jsons + path: files/centos-stream + + - name: Build osdtrace + radostrace with the new JSONs embedded + # Rebuilding here (rather than shipping a binary artifact) guarantees + # the embedded header is regenerated to include the just-generated + # version, and that the binary links against this runner's libs. + run: make -j"$(nproc)" osdtrace radostrace + + - name: Runtime trace verification via cephadm (v${{ matrix.version }}) + id: verify + # Pin the exact point-release image so the running ceph-osd / radosgw + # carry the same build_id as the el9 RPM the JSON was extracted from; + # REQUIRE_EMBEDDED=1 makes "Using embedded DWARF data" mandatory, so a + # silent fall-through to live DWARF parsing counts as a failure. 
+ run: | + sudo CEPH_IMAGE="quay.io/ceph/ceph:v${{ matrix.version }}" \ + REQUIRE_EMBEDDED=1 \ + ./tests/functional-test-cephadm-rgw.sh "${{ matrix.version }}" + + - name: Record verification result + if: always() + run: | + mkdir -p /tmp/vr + echo "${{ steps.verify.outcome }}" > "/tmp/vr/${{ matrix.version }}" + echo "version ${{ matrix.version }} -> ${{ steps.verify.outcome }}" + + - name: Upload verification result + if: always() + uses: actions/upload-artifact@v4 + with: + name: verify-result-${{ matrix.version }} + path: /tmp/vr/ + retention-days: 7 + + - name: Stage trace logs for failure artifact + if: failure() run: | + mkdir -p trace-logs + for f in /tmp/osdtrace-cephadm-${{ matrix.version }}.log \ + /tmp/radostrace-cephadm-${{ matrix.version }}.log \ + /tmp/s3-workload-${{ matrix.version }}.log; do + [ -e "$f" ] && sudo cp "$f" trace-logs/ || true + done + sudo chown -R "$USER" trace-logs 2>/dev/null || true + + - name: Upload trace logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: trace-logs-${{ matrix.version }} + path: trace-logs/ + retention-days: 14 + if-no-files-found: ignore + + open-pr: + needs: [generate, verify] + # Run even if some verify cells failed (we drop those versions); only + # skip when generation produced nothing. 
+ if: ${{ always() && needs.generate.result == 'success' && needs.generate.outputs.succeeded != '0' }} + runs-on: ubuntu-24.04 + timeout-minutes: 15 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Download generated JSONs + uses: actions/download-artifact@v4 + with: + name: generated-jsons + path: /tmp/gen-jsons + + - name: Download generation manifest + uses: actions/download-artifact@v4 + with: + name: gen-manifest + path: /tmp/manifest + + - name: Download verification results + uses: actions/download-artifact@v4 + with: + pattern: verify-result-* + path: /tmp/verify + merge-multiple: true + + - name: Assemble verified JSONs + compose PR body + id: assemble + run: | + # Map version -> success|failure from the verify result files. + declare -A status + if [ -d /tmp/verify ]; then + for f in /tmp/verify/*; do + [ -f "$f" ] || continue + status["$(basename "$f")"]="$(tr -d '[:space:]' < "$f")" + done + fi + + : > /tmp/verified.tsv + : > /tmp/verifyfailed.tsv + copied=0 + if [ -f /tmp/manifest/succeeded.tsv ]; then + while IFS=$'\t' read -r distro tools version pkgver; do + [ -n "$version" ] || continue + st="${status[$version]:-missing}" + if [ "$st" = "success" ]; then + for pair in "osdtrace/osd-${pkgver}_dwarf.json" \ + "radostrace/rados-${pkgver}_dwarf.json"; do + src="/tmp/gen-jsons/${pair}" + dst="files/centos-stream/${pair}" + if [ -f "$src" ]; then + mkdir -p "$(dirname "$dst")" + cp "$src" "$dst" + copied=$((copied + 1)) + fi + done + printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \ + >> /tmp/verified.tsv + else + printf '%s\t%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" "$st" \ + >> /tmp/verifyfailed.tsv + echo "::warning::runtime verification failed for $version (status=$st); dropping from PR" + fi + done < /tmp/manifest/succeeded.tsv + fi + echo "copied=$copied" >> "$GITHUB_OUTPUT" + echo "Copied $copied verified JSON file(s) into the working tree." 
+ { - echo "## Newly added embedded DWARF JSONs" + echo "## Newly added embedded DWARF JSONs (runtime-verified)" + echo + echo "Each version below was traced against a live cephadm cluster" + echo "running \`quay.io/ceph/ceph:v\`; both osdtrace and" + echo "radostrace loaded the **embedded** JSON (build_id match) and" + echo "produced valid trace output." echo echo "| distro | tools | version | pkgver |" echo "|---|---|---|---|" while IFS=$'\t' read -r d t v p; do printf '| %s | %s | %s | `%s` |\n' "$d" "$t" "$v" "$p" - done < /tmp/succeeded.tsv + done < /tmp/verified.tsv echo - if [ -s /tmp/failed.tsv ]; then + if [ -s /tmp/verifyfailed.tsv ]; then + echo "## Generated but failed runtime verification" + echo + echo "Excluded from this PR; will be retried by the next run." + echo + echo '```' + cat /tmp/verifyfailed.tsv + echo '```' + echo + fi + if [ -s /tmp/manifest/failed.tsv ]; then echo "## Versions that failed to generate" echo echo "These will be retried by the next scheduled run." echo echo '```' - cat /tmp/failed.tsv + cat /tmp/manifest/failed.tsv echo '```' echo fi - echo "## Verification" + echo "## How this was produced" echo "- \`tools/detect_missing_dwarf.py\` identified the rows above" echo " by probing \`download.ceph.com/rpm-X.Y.Z/el9/x86_64/\`." echo "- Each JSON was generated inside a disposable" echo " \`quay.io/centos/centos:stream9\` container with the" echo " matching ceph-osd + lib*-debuginfo packages installed." - echo "- \`make -j\` re-aggregated the headers and linked" - echo " \`osdtrace\` + \`radostrace\` cleanly." + echo "- Each JSON was runtime-verified by" + echo " \`tests/functional-test-cephadm-rgw.sh\` against a cephadm" + echo " cluster of the matching version (embedded path enforced)." 
echo - echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\` ($(date -u +'%Y-%m-%d %H:%MZ'))._" + echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\`._" } > /tmp/pr_body.md cat /tmp/pr_body.md - name: Open pull request - if: steps.generate.outputs.succeeded != '0' + if: steps.assemble.outputs.copied != '0' uses: peter-evans/create-pull-request@v6 with: branch: chore/refresh-embedded-dwarf-${{ github.run_id }} @@ -146,7 +371,9 @@ jobs: chore: refresh embedded DWARF for new Ceph point releases Auto-generated by the refresh-embedded-dwarf workflow. - See PR body for the list of versions added. + Each JSON was runtime-verified against a cephadm cluster of the + matching version via the embedded-DWARF path. See PR body for + the list of versions added. body-path: /tmp/pr_body.md labels: | dwarf-refresh @@ -154,3 +381,9 @@ jobs: add-paths: | files/centos-stream/osdtrace/*.json files/centos-stream/radostrace/*.json + + - name: Note when nothing was verified + if: steps.assemble.outputs.copied == '0' + run: | + echo "No version passed runtime verification; no PR opened." + echo "See the 'verify' job matrix for per-version failures." diff --git a/tests/functional-test-cephadm-rgw.sh b/tests/functional-test-cephadm-rgw.sh index bea8a48..474cbe5 100755 --- a/tests/functional-test-cephadm-rgw.sh +++ b/tests/functional-test-cephadm-rgw.sh @@ -119,7 +119,16 @@ fi ############################################################################ info "=== Step 2: install cephadm + resolve image for $CEPH_RELEASE ===" -CEPH_IMG=$(cephadm_image_for_release "$CEPH_RELEASE") +# CEPH_IMAGE lets a caller pin an exact point-release image +# (e.g. quay.io/ceph/ceph:v19.2.2) instead of the per-major "latest" +# default. The embedded-DWARF refresh bot uses this to run the verification +# against the precise version it just generated a JSON for, so the build_id +# matches and the embedded path actually engages. 
+if [ -n "${CEPH_IMAGE:-}" ]; then + CEPH_IMG="$CEPH_IMAGE" +else + CEPH_IMG=$(cephadm_image_for_release "$CEPH_RELEASE") +fi info "image: $CEPH_IMG" install_cephadm "$CEPH_RELEASE" /tmp/cephadm "$CEPH_IMG" @@ -291,6 +300,40 @@ kill "$WL_PUT" "$WL_GET" 2>/dev/null || true sleep 2 +############################################################################ +info "=== Step 13b: check embedded-DWARF boot marker ===" +# When REQUIRE_EMBEDDED=1 (the refresh bot's per-version verification), the +# whole point is to prove the *embedded* JSON works: a fallback to live DWARF +# parsing means the embedded data was never exercised, so treat it as a hard +# failure. Without the flag (the normal PR matrix) the marker is advisory -- +# the major-release "latest" image may legitimately predate or postdate the +# embedded data, and live-parse fallback is an acceptable optimisation miss. +check_embedded_marker() { + local tool="$1" log="$2" + if grep -q "Using embedded DWARF data" "$log"; then + info "✓ $tool used embedded DWARF data" + return 0 + fi + if [ "${REQUIRE_EMBEDDED:-0}" = "1" ]; then + err "$tool did NOT use embedded DWARF data (REQUIRE_EMBEDDED=1)" + if grep -q "Start to parse dwarf info" "$log"; then + err " -> it fell back to live DWARF parsing; the embedded JSON for this version was not matched (build_id mismatch?)" + else + err " -> neither embedded nor live-parse marker present; tool may have failed to start" + fi + return 1 + fi + if grep -q "Start to parse dwarf info" "$log"; then + info "[NOTE] $tool fell back to live DWARF parsing (embedded data not matched in this env)" + else + info "[NOTE] $tool: no embedded/live-parse marker found" + fi + return 0 +} +check_embedded_marker osdtrace "$OSDTRACE_LOG" +check_embedded_marker radostrace "$RADOSTRACE_LOG" + + ############################################################################ info "=== Step 14: gather cluster facts for verifiers ===" # osdtrace counts data rows targeting a single pool; pick the RGW data 
From f9284a7ef605462908d091b47f7bdb18b914d278 Mon Sep 17 00:00:00 2001 From: Dongdong Tao Date: Sat, 6 Jun 2026 11:18:42 +0900 Subject: [PATCH 2/4] refresh-dwarf: ship generated JSONs as a tarball artifact actions/upload-artifact@v4 rejects ':' in file paths, and the JSON filenames embed the package epoch (osd-2:19.2.2-0.el9_dwarf.json), so the raw-file upload failed. Bundle the new JSONs into a colon-free tarball (alongside the manifest TSVs) in generate, and untar in the verify and open-pr jobs. No other logic change. --- .github/workflows/refresh-embedded-dwarf.yaml | 57 +++++++++---------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml index 093ca46..5f05464 100644 --- a/.github/workflows/refresh-embedded-dwarf.yaml +++ b/.github/workflows/refresh-embedded-dwarf.yaml @@ -128,34 +128,31 @@ jobs: make clean make -j"$(nproc)" osdtrace radostrace - - name: Stage generated JSONs + manifest for downstream jobs + - name: Bundle generated JSONs + manifest for downstream jobs if: steps.generate.outputs.succeeded != '0' run: | - mkdir -p /tmp/artifacts/jsons/osdtrace /tmp/artifacts/jsons/radostrace + # The JSON filenames embed the package epoch (e.g. + # osd-2:19.2.2-0.el9_dwarf.json) and actions/upload-artifact@v4 + # rejects the ':' character outright, so ship a colon-free tarball + # (plus the colon-free manifest TSVs) rather than the raw files. 
+ mkdir -p /tmp/artifacts + : > /tmp/artifacts/filelist.txt while IFS=$'\t' read -r distro tools version pkgver; do for f in "files/centos-stream/osdtrace/osd-${pkgver}_dwarf.json" \ "files/centos-stream/radostrace/rados-${pkgver}_dwarf.json"; do - [ -f "$f" ] && cp "$f" "/tmp/artifacts/jsons/${f#files/centos-stream/}" + [ -f "$f" ] && echo "$f" >> /tmp/artifacts/filelist.txt done done < /tmp/succeeded.tsv - mkdir -p /tmp/artifacts/manifest - cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/manifest/ 2>/dev/null || true - find /tmp/artifacts -type f | sort + tar -czf /tmp/artifacts/generated-jsons.tar.gz -T /tmp/artifacts/filelist.txt + cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/ 2>/dev/null || true + echo "=== tarball contents ==="; tar -tzf /tmp/artifacts/generated-jsons.tar.gz - - name: Upload generated JSONs + - name: Upload generated JSONs + manifest if: steps.generate.outputs.succeeded != '0' uses: actions/upload-artifact@v4 with: name: generated-jsons - path: /tmp/artifacts/jsons/ - retention-days: 7 - - - name: Upload generation manifest - if: steps.generate.outputs.succeeded != '0' - uses: actions/upload-artifact@v4 - with: - name: gen-manifest - path: /tmp/artifacts/manifest/ + path: /tmp/artifacts/ retention-days: 7 verify: @@ -184,7 +181,10 @@ jobs: uses: actions/download-artifact@v4 with: name: generated-jsons - path: files/centos-stream + path: /tmp/gen + + - name: Unpack generated JSONs into the tree + run: tar -xzf /tmp/gen/generated-jsons.tar.gz - name: Build osdtrace + radostrace with the new JSONs embedded # Rebuilding here (rather than shipping a binary artifact) guarantees @@ -251,17 +251,16 @@ jobs: with: fetch-depth: 0 - - name: Download generated JSONs + - name: Download generated JSONs + manifest uses: actions/download-artifact@v4 with: name: generated-jsons - path: /tmp/gen-jsons + path: /tmp/gen - - name: Download generation manifest - uses: actions/download-artifact@v4 - with: - name: 
gen-manifest - path: /tmp/manifest + - name: Unpack generated JSONs + run: | + mkdir -p /tmp/gen-extract + tar -xzf /tmp/gen/generated-jsons.tar.gz -C /tmp/gen-extract - name: Download verification results uses: actions/download-artifact@v4 @@ -285,14 +284,14 @@ jobs: : > /tmp/verified.tsv : > /tmp/verifyfailed.tsv copied=0 - if [ -f /tmp/manifest/succeeded.tsv ]; then + if [ -f /tmp/gen/succeeded.tsv ]; then while IFS=$'\t' read -r distro tools version pkgver; do [ -n "$version" ] || continue st="${status[$version]:-missing}" if [ "$st" = "success" ]; then for pair in "osdtrace/osd-${pkgver}_dwarf.json" \ "radostrace/rados-${pkgver}_dwarf.json"; do - src="/tmp/gen-jsons/${pair}" + src="/tmp/gen-extract/files/centos-stream/${pair}" dst="files/centos-stream/${pair}" if [ -f "$src" ]; then mkdir -p "$(dirname "$dst")" @@ -307,7 +306,7 @@ jobs: >> /tmp/verifyfailed.tsv echo "::warning::runtime verification failed for $version (status=$st); dropping from PR" fi - done < /tmp/manifest/succeeded.tsv + done < /tmp/gen/succeeded.tsv fi echo "copied=$copied" >> "$GITHUB_OUTPUT" echo "Copied $copied verified JSON file(s) into the working tree." @@ -336,13 +335,13 @@ jobs: echo '```' echo fi - if [ -s /tmp/manifest/failed.tsv ]; then + if [ -s /tmp/gen/failed.tsv ]; then echo "## Versions that failed to generate" echo echo "These will be retried by the next scheduled run." echo echo '```' - cat /tmp/manifest/failed.tsv + cat /tmp/gen/failed.tsv echo '```' echo fi From a80e1ed999e0a8cd938fd59e34b9a45829cbc83d Mon Sep 17 00:00:00 2001 From: Dongdong Tao Date: Sat, 6 Jun 2026 17:01:07 +0900 Subject: [PATCH 3/4] tests/cephadm: retry bootstrap to survive the orch-backend race `cephadm bootstrap` intermittently aborts at its final step: it restarts the mgr to load the cephadm module, then immediately runs `orch set backend cephadm`. When the orchestrator module has not finished loading yet, that command fails with `Error ENOTSUP ... 
Module 'orchestrator' is not enabled/loaded` and the whole bootstrap fails. This flaked the tentacle (v20.2.1) verify cells two ways: - ubuntu-24.04: newer cephadm auto-rolled back the partial cluster, so the FSID probe came up empty -> "bootstrap failed to produce FSID". - ubuntu-22.04: the older apt-installed cephadm did NOT roll back, leaving a partial /var/lib/ceph/ with no orchestrator backend. The script proceeded, OSDs were never scheduled, and wait_cephadm_healthy burned the full 900s timeout. Make cluster launch robust: retry the bootstrap up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3), purging any partial cluster (rm-cluster + rm -rf) between attempts so stale mon/mgr containers don't hold ports and the FSID probe can't latch onto a broken cluster id. On total failure the helper returns non-zero and the caller fails fast instead of timing out later against a backend-less cluster. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/functional-test-cephadm-rgw.sh | 2 +- tests/lib/cephadm-setup.sh | 79 ++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/tests/functional-test-cephadm-rgw.sh b/tests/functional-test-cephadm-rgw.sh index 474cbe5..514326c 100755 --- a/tests/functional-test-cephadm-rgw.sh +++ b/tests/functional-test-cephadm-rgw.sh @@ -144,7 +144,7 @@ info "OSD devices: ${OSD_DEVS[*]}" info "=== Step 4: bootstrap single-host cephadm cluster ===" MON_IP=$(hostname -I | awk '{print $1}') info "MON_IP=$MON_IP" -FSID=$(cephadm_bootstrap_single_host "$CEPH_IMG" "$MON_IP" /tmp/cephadm) +FSID=$(cephadm_bootstrap_single_host "$CEPH_IMG" "$MON_IP" /tmp/cephadm) || true [ -n "$FSID" ] || { err "bootstrap failed to produce FSID"; exit 1; } info "FSID=$FSID" diff --git a/tests/lib/cephadm-setup.sh b/tests/lib/cephadm-setup.sh index dfb9051..2a66497 100755 --- a/tests/lib/cephadm-setup.sh +++ b/tests/lib/cephadm-setup.sh @@ -115,14 +115,49 @@ provision_loopback_osds() { } +# _purge_partial_clusters [cephadm_bin] 
+# +# Remove every cluster currently under /var/lib/ceph so the next bootstrap +# attempt starts from a clean slate. A failed `cephadm bootstrap` does NOT +# reliably roll itself back: newer cephadm builds auto-delete on failure, +# but the older apt-installed cephadm on Ubuntu 22.04 leaves the partial +# /var/lib/ceph/<fsid> (plus its mon/mgr containers and systemd units) +# behind. Left in place, that debris (a) makes the retry's port checks fail +# because the old mon/mgr still hold the ports, and (b) fools the +# `ls | head -1` FSID detection into echoing a stale, broken cluster id. +_purge_partial_clusters() { + local cephadm_bin="${1:-cephadm}" + local fsid + for fsid in $(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$'); do + info "purging leftover cluster $fsid before bootstrap" + "$cephadm_bin" rm-cluster --fsid "$fsid" --force --zap-osds >&2 2>/dev/null || true + rm -rf "/var/lib/ceph/$fsid" 2>/dev/null || true + done +} + + # cephadm_bootstrap_single_host <image> <mon_ip> [cephadm_bin] # # Bootstrap the cluster and echo the new FSID. --single-host-defaults # relaxes the no-single-host warnings; --skip-mon-network avoids requiring # a real network range; --allow-overwrite lets the test be idempotent # across retries on the same runner. +# +# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3). +# `cephadm bootstrap` has a known transient race near the end: it restarts +# the mgr to load the cephadm module, then immediately runs +# `orch set backend cephadm`. If the orchestrator module has not finished +# loading yet, that command fails with +# Error ENOTSUP ... Module 'orchestrator' is not enabled/loaded +# and the whole bootstrap aborts. It is not resumable, so the robust +# response is to purge the half-built cluster and bootstrap again from +# scratch. 
Returns non-zero (echoing nothing) if every attempt fails, so +# the caller fails fast instead of proceeding against a backend-less cluster +# and timing out later while waiting for OSDs that can never be scheduled. cephadm_bootstrap_single_host() { local image="$1"; local mon_ip="$2"; local cephadm_bin="${3:-/tmp/cephadm}" + local max_attempts="${CEPHADM_BOOTSTRAP_ATTEMPTS:-3}" + local retry_delay="${CEPHADM_BOOTSTRAP_RETRY_DELAY:-15}" # --cluster-network is intentionally omitted: it requires a *network* # address (e.g. 10.0.0.0/24), not a host address, and the rejection # message is unhelpful ("has host bits set"). For a single-host cluster @@ -135,17 +170,39 @@ cephadm_bootstrap_single_host() { # --no-cleanup-on-failure would let us inspect a partial bootstrap, but # it was added later in the quincy line and the apt-installed cephadm # on Ubuntu 22.04 rejects it. CI doesn't need the inspection anyway — - # the default (auto-cleanup on failed bootstrap) is what we want there. - "$cephadm_bin" --image "$image" bootstrap \ - --mon-ip "$mon_ip" \ - --skip-mon-network \ - --skip-firewalld \ - --skip-dashboard \ - --single-host-defaults \ - --allow-overwrite \ - --allow-mismatched-release \ - >&2 - ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1 + # we purge partial clusters ourselves between attempts (see below). + local attempt rc + for (( attempt=1; attempt<=max_attempts; attempt++ )); do + # Always start from a clean slate: clears any debris left by a + # previous failed attempt (or a stale cluster from an earlier run on + # the same self-hosted runner). + _purge_partial_clusters "$cephadm_bin" + + rc=0 + "$cephadm_bin" --image "$image" bootstrap \ + --mon-ip "$mon_ip" \ + --skip-mon-network \ + --skip-firewalld \ + --skip-dashboard \ + --single-host-defaults \ + --allow-overwrite \ + --allow-mismatched-release \ + >&2 || rc=$? 
+ + if [[ $rc -eq 0 ]]; then + ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1 + return 0 + fi + + err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc)" + if (( attempt < max_attempts )); then + info "retrying bootstrap in ${retry_delay}s ..." + sleep "$retry_delay" + fi + done + + err "cephadm bootstrap failed after ${max_attempts} attempts" + return 1 } From e1f37971771830c21680a076a576e9804479a465 Mon Sep 17 00:00:00 2001 From: Dongdong Tao Date: Sat, 6 Jun 2026 17:29:09 +0900 Subject: [PATCH 4/4] tests/cephadm: judge bootstrap by orchestrator state, not exit code The previous retry commit keyed success off the bootstrap exit code, which broke the quincy cell (it had been passing). Two issues: 1. cephadm bootstrap's exit code conflates opposite outcomes. Bootstrapping an older image (quincy/reef) with a newer host cephadm tries to deploy services the image lacks -- `orch apply ceph-exporter` fails with EINVAL on quincy -- so bootstrap prints "Bootstrap complete." and leaves a fully working cluster, yet still exits non-zero (234). The old code ignored the exit code entirely (set -e is suppressed inside FSID=$(...)), so it passed then; my rc==0 gate wrongly rejected the good cluster and retried 3x. Now success is judged by the cluster's real state: an FSID exists AND the orchestrator backend is live (`ceph orch status` works). That accepts the benign quincy mismatch while still rejecting the tentacle orch-backend race (no backend -> retry), which is the property the downstream `ceph orch apply` for OSDs/RGW actually needs. 2. info() prints to stdout, and this helper's stdout is captured as the FSID by the caller. Any progress line (purge/retry/accept) leaked into the captured value -- e.g. "ERROR: not an fsid: INFO: retrying bootstrap...". Route every info() inside the capture to stderr; emit only the bare FSID on stdout. 
Validated the stdout-only-FSID invariant locally across happy / quincy (benign rc=234) / tentacle-retry / hard-fail scenarios. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lib/cephadm-setup.sh | 69 ++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/tests/lib/cephadm-setup.sh b/tests/lib/cephadm-setup.sh index 2a66497..2591c47 100755 --- a/tests/lib/cephadm-setup.sh +++ b/tests/lib/cephadm-setup.sh @@ -129,13 +129,30 @@ _purge_partial_clusters() { local cephadm_bin="${1:-cephadm}" local fsid for fsid in $(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$'); do - info "purging leftover cluster $fsid before bootstrap" + # NOTE: info() prints to stdout; this helper runs inside the + # FSID=$(cephadm_bootstrap_single_host ...) capture, so route every + # progress line to stderr or it pollutes the captured FSID. + info "purging leftover cluster $fsid before bootstrap" >&2 "$cephadm_bin" rm-cluster --fsid "$fsid" --force --zap-osds >&2 2>/dev/null || true rm -rf "/var/lib/ceph/$fsid" 2>/dev/null || true done } +# _orch_backend_ready <fsid> +# +# True iff the orchestrator backend is live in the freshly bootstrapped +# cluster — i.e. `ceph orch status` works. This is the property that +# actually matters: the test next runs `ceph orch apply` for OSDs/RGW, which +# needs a working cephadm backend. We check this rather than the bootstrap +# exit code because that code conflates two very different outcomes (see +# cephadm_bootstrap_single_host). NOTE(review): this runs `cephadm` from PATH, not the caller's cephadm_bin used for bootstrap/purge — confirm both resolve to the same binary. +_orch_backend_ready() { + local fsid="$1" + cephadm shell --fsid "$fsid" -- ceph orch status >/dev/null 2>&1 +} + + # cephadm_bootstrap_single_host <image> <mon_ip> [cephadm_bin] # # Bootstrap the cluster and echo the new FSID. --single-host-defaults @@ -143,17 +160,28 @@ _purge_partial_clusters() { # a real network range; --allow-overwrite lets the test be idempotent # across retries on the same runner. # -# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3). 
-# `cephadm bootstrap` has a known transient race near the end: it restarts -# the mgr to load the cephadm module, then immediately runs -# `orch set backend cephadm`. If the orchestrator module has not finished -# loading yet, that command fails with -# Error ENOTSUP ... Module 'orchestrator' is not enabled/loaded -# and the whole bootstrap aborts. It is not resumable, so the robust -# response is to purge the half-built cluster and bootstrap again from -# scratch. Returns non-zero (echoing nothing) if every attempt fails, so -# the caller fails fast instead of proceeding against a backend-less cluster -# and timing out later while waiting for OSDs that can never be scheduled. +# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3), +# and success is judged by the cluster's actual state, NOT by the bootstrap +# exit code. That distinction matters because the exit code conflates two +# opposite outcomes: +# +# * Transient, fatal (must retry): a race near the end of bootstrap — the +# mgr restarts to load the cephadm module, then `orch set backend +# cephadm` runs before the orchestrator module finishes loading and +# fails with `Error ENOTSUP ... Module 'orchestrator' is not +# enabled/loaded`. The whole bootstrap aborts and leaves no working +# backend. Seen on tentacle (v20.2.x). +# * Benign, non-fatal (must accept): bootstrapping an older image (quincy, +# reef) with a newer host cephadm tries to deploy services the image +# does not know — e.g. `orch apply ceph-exporter` fails with EINVAL on +# quincy. cephadm logs it, prints "Bootstrap complete.", and leaves a +# fully functional cluster, yet still exits non-zero. +# +# So after each attempt we keep the cluster only if its orchestrator backend +# is actually live (_orch_backend_ready); otherwise we purge it and retry. 
+# Returns non-zero (echoing nothing) if every attempt fails, so the caller +# fails fast instead of proceeding against a backend-less cluster and timing +# out later while waiting for OSDs that can never be scheduled. cephadm_bootstrap_single_host() { local image="$1"; local mon_ip="$2"; local cephadm_bin="${3:-/tmp/cephadm}" local max_attempts="${CEPHADM_BOOTSTRAP_ATTEMPTS:-3}" @@ -171,7 +199,7 @@ cephadm_bootstrap_single_host() { # it was added later in the quincy line and the apt-installed cephadm # on Ubuntu 22.04 rejects it. CI doesn't need the inspection anyway — # we purge partial clusters ourselves between attempts (see below). - local attempt rc + local attempt rc fsid for (( attempt=1; attempt<=max_attempts; attempt++ )); do # Always start from a clean slate: clears any debris left by a # previous failed attempt (or a stale cluster from an earlier run on @@ -189,14 +217,21 @@ cephadm_bootstrap_single_host() { --allow-mismatched-release \ >&2 || rc=$? - if [[ $rc -eq 0 ]]; then - ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1 + fsid=$(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1) + if [[ -n "$fsid" ]] && _orch_backend_ready "$fsid"; then + # Cluster is up with a working orchestrator backend; a non-zero + # rc here is the benign service-apply mismatch described above. + # info() goes to stdout, which this function's caller captures as + # the FSID — keep all progress on stderr and emit ONLY the bare + # FSID on stdout. + [[ $rc -eq 0 ]] || info "bootstrap exited rc=$rc but orchestrator backend is live — accepting cluster $fsid" >&2 + echo "$fsid" return 0 fi - err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc)" + err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc, orchestrator backend not ready)" if (( attempt < max_attempts )); then - info "retrying bootstrap in ${retry_delay}s ..." + info "retrying bootstrap in ${retry_delay}s ..." 
>&2 sleep "$retry_delay" fi done