From c460c03bd986d5de4617d3f9dc30ed42a55ccd5a Mon Sep 17 00:00:00 2001
From: Dongdong Tao <dongdong.tao@canonical.com>
Date: Fri, 5 Jun 2026 23:37:26 +0900
Subject: [PATCH 1/4] refresh-dwarf: runtime-verify each generated JSON via
 cephadm

The refresh bot previously only proved a generated JSON parses and links
into the embedded header -- never that the tools actually trace meaningfully
through it.  Add a parallel per-version runtime verification:

- Split the workflow into three jobs: generate -> verify (matrix) -> open-pr.
- verify fans out one runner per generated version, rebuilds osdtrace +
  radostrace with the new JSON embedded, provisions a single-host cephadm
  cluster on quay.io/ceph/ceph:v<version> (whose ceph-osd build_id matches
  the el9 RPM the JSON was extracted from), drives an S3 workload, and traces
  a live OSD + radosgw through the EMBEDDED path.
- open-pr includes only versions that passed; failures are dropped from the
  PR and listed for retry next run.

functional-test-cephadm-rgw.sh gains two opt-in knobs (existing matrix
behaviour unchanged when unset):
- CEPH_IMAGE: pin an exact point-release image instead of the per-major latest.
- REQUIRE_EMBEDDED=1: make the 'Using embedded DWARF data' marker mandatory,
  so a silent fall-back to live DWARF parsing fails the test.
---
 .github/workflows/refresh-embedded-dwarf.yaml | 263 +++++++++++++++++-
 tests/functional-test-cephadm-rgw.sh          |  45 ++-
 2 files changed, 292 insertions(+), 16 deletions(-)
diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml
index 4952523..093ca46 100644
--- a/.github/workflows/refresh-embedded-dwarf.yaml
+++ b/.github/workflows/refresh-embedded-dwarf.yaml
@@ -3,8 +3,24 @@ name: Refresh embedded DWARF for new Ceph releases
 # Phase 1 (this file): centos-stream / el9 only.  Detects newly-published
 # Ceph point releases (quincy / reef / squid / tentacle) on
 # download.ceph.com, generates osdtrace + radostrace DWARF JSONs for the
-# missing ones inside a disposable centos:stream9 podman container, and
-# opens a follow-up PR with the new files.
+# missing ones inside a disposable centos:stream9 podman container,
+# runtime-verifies each one against a real cephadm cluster of the matching
+# version, and opens a follow-up PR with only the verified files.
+#
+# Three jobs:
+#   generate  - detect + generate JSONs (one podman container per version),
+#               re-aggregate the embedded header as an early link gate, and
+#               publish the new JSONs + manifest as artifacts.  Emits a JSON
+#               array of the generated versions for the verify matrix.
+#   verify    - dynamic matrix, one runner per generated version (parallel).
+#               Each cell rebuilds osdtrace/radostrace with the new JSONs
+#               embedded, provisions a single-host cephadm cluster running
+#               quay.io/ceph/ceph:v<version> (whose build_id matches the el9
+#               RPM the JSON came from), drives an S3 workload, and traces a
+#               real OSD + radosgw via the EMBEDDED path (REQUIRE_EMBEDDED=1
+#               -> a fallback to live DWARF parsing is a failure).
+#   open-pr   - assemble the verified subset (drop any version whose runtime
+#               verification failed; list it for retry) and open the PR.
 #
 # Phases 2-3 (future): mirror the same detect/generate/PR flow for
 # quay.io container-image build-ids, then for Ubuntu / Cloud Archive /
@@ -24,12 +40,18 @@ permissions:
   pull-requests: write
 
 jobs:
-  refresh:
+  generate:
     runs-on: ubuntu-24.04
     # Worst case: 15 missing versions * ~6 min/version = 90 min for the
     # generators alone, plus ~5 min for the host build + ~5 min for the
-    # final rebuild + PR open.  120 min leaves headroom for slow downloads.
+    # final rebuild.  The 180 min timeout leaves headroom for slow downloads.
     timeout-minutes: 180
+    outputs:
+      count: ${{ steps.detect.outputs.count }}
+      succeeded: ${{ steps.generate.outputs.succeeded }}
+      # JSON array of generated versions, e.g. ["17.2.7","19.2.2"]; drives
+      # the verify job's dynamic matrix.  "[]" when nothing was generated.
+      versions: ${{ steps.generate.outputs.versions }}
 
     steps:
       - name: Checkout code and submodules
@@ -89,6 +111,12 @@ jobs:
             echo "succeeded=$S" >> "$GITHUB_OUTPUT"
             echo "failed=$F" >> "$GITHUB_OUTPUT"
 
+            # Emit the generated versions as a JSON array for the verify matrix.
+            versions=$(awk -F'\t' 'NF>=3{print $3}' /tmp/succeeded.tsv \
+                | python3 -c "import sys,json; print(json.dumps([l.strip() for l in sys.stdin if l.strip()]))")
+            echo "versions=$versions" >> "$GITHUB_OUTPUT"
+            echo "versions=$versions"
+
       - name: Re-aggregate embedded DWARF header + relink
         # This step proves the new JSONs parse cleanly through
         # tools/generate_embedded_dwarf.py and that osdtrace + radostrace
@@ -100,43 +128,240 @@ jobs:
             make clean
             make -j"$(nproc)" osdtrace radostrace
 
-      - name: Compose pull-request body
+      - name: Stage generated JSONs + manifest for downstream jobs
+        if: steps.generate.outputs.succeeded != '0'
+        run: |
+            mkdir -p /tmp/artifacts/jsons/osdtrace /tmp/artifacts/jsons/radostrace
+            while IFS=$'\t' read -r distro tools version pkgver; do
+                for f in "files/centos-stream/osdtrace/osd-${pkgver}_dwarf.json" \
+                         "files/centos-stream/radostrace/rados-${pkgver}_dwarf.json"; do
+                    [ -f "$f" ] && cp "$f" "/tmp/artifacts/jsons/${f#files/centos-stream/}"
+                done
+            done < /tmp/succeeded.tsv
+            mkdir -p /tmp/artifacts/manifest
+            cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/manifest/ 2>/dev/null || true
+            find /tmp/artifacts -type f | sort
+
+      - name: Upload generated JSONs
+        if: steps.generate.outputs.succeeded != '0'
+        uses: actions/upload-artifact@v4
+        with:
+          name: generated-jsons
+          path: /tmp/artifacts/jsons/
+          retention-days: 7
+
+      - name: Upload generation manifest
         if: steps.generate.outputs.succeeded != '0'
+        uses: actions/upload-artifact@v4
+        with:
+          name: gen-manifest
+          path: /tmp/artifacts/manifest/
+          retention-days: 7
+
+  verify:
+    needs: generate
+    # Skip entirely when nothing was generated (empty matrix is invalid).
+    if: ${{ needs.generate.outputs.versions != '' && needs.generate.outputs.versions != '[]' }}
+    runs-on: ubuntu-24.04
+    timeout-minutes: 45
+    strategy:
+      fail-fast: false
+      matrix:
+        version: ${{ fromJson(needs.generate.outputs.versions) }}
+    steps:
+      - name: Checkout code and submodules
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+
+      - name: Install build dependencies
+        run: |
+            sudo apt-get update
+            sudo apt-get install -y g++ clang libelf-dev libc6-dev-i386 libdw-dev python3
+
+      - name: Pull in the freshly generated JSONs
+        uses: actions/download-artifact@v4
+        with:
+          name: generated-jsons
+          path: files/centos-stream
+
+      - name: Build osdtrace + radostrace with the new JSONs embedded
+        # Rebuilding here (rather than shipping a binary artifact) guarantees
+        # the embedded header is regenerated to include the just-generated
+        # version, and that the binary links against this runner's libs.
+        run: make -j"$(nproc)" osdtrace radostrace
+
+      - name: Runtime trace verification via cephadm (v${{ matrix.version }})
+        id: verify
+        # Pin the exact point-release image so the running ceph-osd / radosgw
+        # carry the same build_id as the el9 RPM the JSON was extracted from;
+        # REQUIRE_EMBEDDED=1 makes "Using embedded DWARF data" mandatory, so a
+        # silent fall-through to live DWARF parsing counts as a failure.
+        run: |
+            sudo CEPH_IMAGE="quay.io/ceph/ceph:v${{ matrix.version }}" \
+                 REQUIRE_EMBEDDED=1 \
+                 ./tests/functional-test-cephadm-rgw.sh "${{ matrix.version }}"
+
+      - name: Record verification result
+        if: always()
+        run: |
+            mkdir -p /tmp/vr
+            echo "${{ steps.verify.outcome }}" > "/tmp/vr/${{ matrix.version }}"
+            echo "version ${{ matrix.version }} -> ${{ steps.verify.outcome }}"
+
+      - name: Upload verification result
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: verify-result-${{ matrix.version }}
+          path: /tmp/vr/
+          retention-days: 7
+
+      - name: Stage trace logs for failure artifact
+        if: failure()
         run: |
+            mkdir -p trace-logs
+            for f in /tmp/osdtrace-cephadm-${{ matrix.version }}.log \
+                     /tmp/radostrace-cephadm-${{ matrix.version }}.log \
+                     /tmp/s3-workload-${{ matrix.version }}.log; do
+                [ -e "$f" ] && sudo cp "$f" trace-logs/ || true
+            done
+            sudo chown -R "$USER" trace-logs 2>/dev/null || true
+
+      - name: Upload trace logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: trace-logs-${{ matrix.version }}
+          path: trace-logs/
+          retention-days: 14
+          if-no-files-found: ignore
+
+  open-pr:
+    needs: [generate, verify]
+    # Run even if some verify cells failed (we drop those versions); only
+    # skip when generation produced nothing.
+    if: ${{ always() && needs.generate.result == 'success' && needs.generate.outputs.succeeded != '0' }}
+    runs-on: ubuntu-24.04
+    timeout-minutes: 15
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Download generated JSONs
+        uses: actions/download-artifact@v4
+        with:
+          name: generated-jsons
+          path: /tmp/gen-jsons
+
+      - name: Download generation manifest
+        uses: actions/download-artifact@v4
+        with:
+          name: gen-manifest
+          path: /tmp/manifest
+
+      - name: Download verification results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: verify-result-*
+          path: /tmp/verify
+          merge-multiple: true
+
+      - name: Assemble verified JSONs + compose PR body
+        id: assemble
+        run: |
+            # Map version -> success|failure from the verify result files.
+            declare -A status
+            if [ -d /tmp/verify ]; then
+                for f in /tmp/verify/*; do
+                    [ -f "$f" ] || continue
+                    status["$(basename "$f")"]="$(tr -d '[:space:]' < "$f")"
+                done
+            fi
+
+            : > /tmp/verified.tsv
+            : > /tmp/verifyfailed.tsv
+            copied=0
+            if [ -f /tmp/manifest/succeeded.tsv ]; then
+                while IFS=$'\t' read -r distro tools version pkgver; do
+                    [ -n "$version" ] || continue
+                    st="${status[$version]:-missing}"
+                    if [ "$st" = "success" ]; then
+                        for pair in "osdtrace/osd-${pkgver}_dwarf.json" \
+                                    "radostrace/rados-${pkgver}_dwarf.json"; do
+                            src="/tmp/gen-jsons/${pair}"
+                            dst="files/centos-stream/${pair}"
+                            if [ -f "$src" ]; then
+                                mkdir -p "$(dirname "$dst")"
+                                cp "$src" "$dst"
+                                copied=$((copied + 1))
+                            fi
+                        done
+                        printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \
+                            >> /tmp/verified.tsv
+                    else
+                        printf '%s\t%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" "$st" \
+                            >> /tmp/verifyfailed.tsv
+                        echo "::warning::runtime verification failed for $version (status=$st); dropping from PR"
+                    fi
+                done < /tmp/manifest/succeeded.tsv
+            fi
+            echo "copied=$copied" >> "$GITHUB_OUTPUT"
+            echo "Copied $copied verified JSON file(s) into the working tree."
+
             {
-                echo "## Newly added embedded DWARF JSONs"
+                echo "## Newly added embedded DWARF JSONs (runtime-verified)"
+                echo
+                echo "Each version below was traced against a live cephadm cluster"
+                echo "running \`quay.io/ceph/ceph:v<version>\`; both osdtrace and"
+                echo "radostrace loaded the **embedded** JSON (build_id match) and"
+                echo "produced valid trace output."
                 echo
                 echo "| distro | tools | version | pkgver |"
                 echo "|---|---|---|---|"
                 while IFS=$'\t' read -r d t v p; do
                     printf '| %s | %s | %s | `%s` |\n' "$d" "$t" "$v" "$p"
-                done < /tmp/succeeded.tsv
+                done < /tmp/verified.tsv
                 echo
-                if [ -s /tmp/failed.tsv ]; then
+                if [ -s /tmp/verifyfailed.tsv ]; then
+                    echo "## Generated but failed runtime verification"
+                    echo
+                    echo "Excluded from this PR; will be retried by the next run."
+                    echo
+                    echo '```'
+                    cat /tmp/verifyfailed.tsv
+                    echo '```'
+                    echo
+                fi
+                if [ -s /tmp/manifest/failed.tsv ]; then
                     echo "## Versions that failed to generate"
                     echo
                     echo "These will be retried by the next scheduled run."
                     echo
                     echo '```'
-                    cat /tmp/failed.tsv
+                    cat /tmp/manifest/failed.tsv
                     echo '```'
                     echo
                 fi
-                echo "## Verification"
+                echo "## How this was produced"
                 echo "- \`tools/detect_missing_dwarf.py\` identified the rows above"
                 echo "  by probing \`download.ceph.com/rpm-X.Y.Z/el9/x86_64/\`."
                 echo "- Each JSON was generated inside a disposable"
                 echo "  \`quay.io/centos/centos:stream9\` container with the"
                 echo "  matching ceph-osd + lib*-debuginfo packages installed."
-                echo "- \`make -j\` re-aggregated the headers and linked"
-                echo "  \`osdtrace\` + \`radostrace\` cleanly."
+                echo "- Each JSON was runtime-verified by"
+                echo "  \`tests/functional-test-cephadm-rgw.sh\` against a cephadm"
+                echo "  cluster of the matching version (embedded path enforced)."
                 echo
-                echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\` ($(date -u +'%Y-%m-%d %H:%MZ'))._"
+                echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\`._"
             } > /tmp/pr_body.md
             cat /tmp/pr_body.md
 
       - name: Open pull request
-        if: steps.generate.outputs.succeeded != '0'
+        if: steps.assemble.outputs.copied != '0'
         uses: peter-evans/create-pull-request@v6
         with:
           branch: chore/refresh-embedded-dwarf-${{ github.run_id }}
@@ -146,7 +371,9 @@ jobs:
             chore: refresh embedded DWARF for new Ceph point releases
 
             Auto-generated by the refresh-embedded-dwarf workflow.
-            See PR body for the list of versions added.
+            Each JSON was runtime-verified against a cephadm cluster of the
+            matching version via the embedded-DWARF path.  See PR body for
+            the list of versions added.
           body-path: /tmp/pr_body.md
           labels: |
             dwarf-refresh
@@ -154,3 +381,9 @@ jobs:
           add-paths: |
             files/centos-stream/osdtrace/*.json
             files/centos-stream/radostrace/*.json
+
+      - name: Note when nothing was verified
+        if: steps.assemble.outputs.copied == '0'
+        run: |
+            echo "No version passed runtime verification; no PR opened."
+            echo "See the 'verify' job matrix for per-version failures."
diff --git a/tests/functional-test-cephadm-rgw.sh b/tests/functional-test-cephadm-rgw.sh
index bea8a48..474cbe5 100755
--- a/tests/functional-test-cephadm-rgw.sh
+++ b/tests/functional-test-cephadm-rgw.sh
@@ -119,7 +119,16 @@ fi
 
 ############################################################################
 info "=== Step 2: install cephadm + resolve image for $CEPH_RELEASE ==="
-CEPH_IMG=$(cephadm_image_for_release "$CEPH_RELEASE")
+# CEPH_IMAGE lets a caller pin an exact point-release image
+# (e.g. quay.io/ceph/ceph:v19.2.2) instead of the per-major "latest"
+# default.  The embedded-DWARF refresh bot uses this to run the verification
+# against the precise version it just generated a JSON for, so the build_id
+# matches and the embedded path actually engages.
+if [ -n "${CEPH_IMAGE:-}" ]; then
+    CEPH_IMG="$CEPH_IMAGE"
+else
+    CEPH_IMG=$(cephadm_image_for_release "$CEPH_RELEASE")
+fi
 info "image: $CEPH_IMG"
 install_cephadm "$CEPH_RELEASE" /tmp/cephadm "$CEPH_IMG"
 
@@ -291,6 +300,40 @@ kill "$WL_PUT" "$WL_GET" 2>/dev/null || true
 sleep 2
 
 
+############################################################################
+info "=== Step 13b: check embedded-DWARF boot marker ==="
+# When REQUIRE_EMBEDDED=1 (the refresh bot's per-version verification), the
+# whole point is to prove the *embedded* JSON works: a fallback to live DWARF
+# parsing means the embedded data was never exercised, so treat it as a hard
+# failure.  Without the flag (the normal PR matrix) the marker is advisory --
+# the major-release "latest" image may legitimately predate or postdate the
+# embedded data, and live-parse fallback is an acceptable optimisation miss.
+check_embedded_marker() {
+    local tool="$1" log="$2"  # $1: tool name used in messages; $2: log file to scan
+    if grep -q "Using embedded DWARF data" "$log"; then
+        info "✓ $tool used embedded DWARF data"
+        return 0
+    fi
+    if [ "${REQUIRE_EMBEDDED:-0}" = "1" ]; then
+        err "$tool did NOT use embedded DWARF data (REQUIRE_EMBEDDED=1)"
+        if grep -q "Start to parse dwarf info" "$log"; then
+            err "  -> it fell back to live DWARF parsing; the embedded JSON for this version was not matched (build_id mismatch?)"
+        else
+            err "  -> neither embedded nor live-parse marker present; tool may have failed to start"
+        fi
+        return 1  # strict mode: a missing embedded marker is a failure
+    fi
+    if grep -q "Start to parse dwarf info" "$log"; then
+        info "[NOTE] $tool fell back to live DWARF parsing (embedded data not matched in this env)"
+    else
+        info "[NOTE] $tool: no embedded/live-parse marker found"
+    fi
+    return 0  # advisory mode: the fallback is reported but does not fail
+}
+check_embedded_marker osdtrace   "$OSDTRACE_LOG"
+check_embedded_marker radostrace "$RADOSTRACE_LOG"
+
+
 ############################################################################
 info "=== Step 14: gather cluster facts for verifiers ==="
 # osdtrace counts data rows targeting a single pool; pick the RGW data

From f9284a7ef605462908d091b47f7bdb18b914d278 Mon Sep 17 00:00:00 2001
From: Dongdong Tao <dongdong.tao@canonical.com>
Date: Sat, 6 Jun 2026 11:18:42 +0900
Subject: [PATCH 2/4] refresh-dwarf: ship generated JSONs as a tarball artifact

actions/upload-artifact@v4 rejects ':' in file paths, and the JSON
filenames embed the package epoch (osd-2:19.2.2-0.el9_dwarf.json), so the
raw-file upload failed.  Bundle the new JSONs into a colon-free tarball
(alongside the manifest TSVs) in generate, and untar in the verify and
open-pr jobs.  No other logic change.
---
 .github/workflows/refresh-embedded-dwarf.yaml | 57 +++++++++----------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml
index 093ca46..5f05464 100644
--- a/.github/workflows/refresh-embedded-dwarf.yaml
+++ b/.github/workflows/refresh-embedded-dwarf.yaml
@@ -128,34 +128,31 @@ jobs:
             make clean
             make -j"$(nproc)" osdtrace radostrace
 
-      - name: Stage generated JSONs + manifest for downstream jobs
+      - name: Bundle generated JSONs + manifest for downstream jobs
         if: steps.generate.outputs.succeeded != '0'
         run: |
-            mkdir -p /tmp/artifacts/jsons/osdtrace /tmp/artifacts/jsons/radostrace
+            # The JSON filenames embed the package epoch (e.g.
+            # osd-2:19.2.2-0.el9_dwarf.json) and actions/upload-artifact@v4
+            # rejects the ':' character outright, so ship a colon-free tarball
+            # (plus the colon-free manifest TSVs) rather than the raw files.
+            mkdir -p /tmp/artifacts
+            : > /tmp/artifacts/filelist.txt
             while IFS=$'\t' read -r distro tools version pkgver; do
                 for f in "files/centos-stream/osdtrace/osd-${pkgver}_dwarf.json" \
                          "files/centos-stream/radostrace/rados-${pkgver}_dwarf.json"; do
-                    [ -f "$f" ] && cp "$f" "/tmp/artifacts/jsons/${f#files/centos-stream/}"
+                    [ -f "$f" ] && echo "$f" >> /tmp/artifacts/filelist.txt
                 done
             done < /tmp/succeeded.tsv
-            mkdir -p /tmp/artifacts/manifest
-            cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/manifest/ 2>/dev/null || true
-            find /tmp/artifacts -type f | sort
+            tar -czf /tmp/artifacts/generated-jsons.tar.gz -T /tmp/artifacts/filelist.txt
+            cp /tmp/succeeded.tsv /tmp/failed.tsv /tmp/missing.tsv /tmp/artifacts/ 2>/dev/null || true
+            echo "=== tarball contents ==="; tar -tzf /tmp/artifacts/generated-jsons.tar.gz
 
-      - name: Upload generated JSONs
+      - name: Upload generated JSONs + manifest
         if: steps.generate.outputs.succeeded != '0'
         uses: actions/upload-artifact@v4
         with:
           name: generated-jsons
-          path: /tmp/artifacts/jsons/
-          retention-days: 7
-
-      - name: Upload generation manifest
-        if: steps.generate.outputs.succeeded != '0'
-        uses: actions/upload-artifact@v4
-        with:
-          name: gen-manifest
-          path: /tmp/artifacts/manifest/
+          path: /tmp/artifacts/
           retention-days: 7
 
   verify:
@@ -184,7 +181,10 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: generated-jsons
-          path: files/centos-stream
+          path: /tmp/gen
+
+      - name: Unpack generated JSONs into the tree
+        run: tar -xzf /tmp/gen/generated-jsons.tar.gz
 
       - name: Build osdtrace + radostrace with the new JSONs embedded
         # Rebuilding here (rather than shipping a binary artifact) guarantees
@@ -251,17 +251,16 @@ jobs:
         with:
           fetch-depth: 0
 
-      - name: Download generated JSONs
+      - name: Download generated JSONs + manifest
         uses: actions/download-artifact@v4
         with:
           name: generated-jsons
-          path: /tmp/gen-jsons
+          path: /tmp/gen
 
-      - name: Download generation manifest
-        uses: actions/download-artifact@v4
-        with:
-          name: gen-manifest
-          path: /tmp/manifest
+      - name: Unpack generated JSONs
+        run: |
+            mkdir -p /tmp/gen-extract
+            tar -xzf /tmp/gen/generated-jsons.tar.gz -C /tmp/gen-extract
 
       - name: Download verification results
         uses: actions/download-artifact@v4
@@ -285,14 +284,14 @@ jobs:
             : > /tmp/verified.tsv
             : > /tmp/verifyfailed.tsv
             copied=0
-            if [ -f /tmp/manifest/succeeded.tsv ]; then
+            if [ -f /tmp/gen/succeeded.tsv ]; then
                 while IFS=$'\t' read -r distro tools version pkgver; do
                     [ -n "$version" ] || continue
                     st="${status[$version]:-missing}"
                     if [ "$st" = "success" ]; then
                         for pair in "osdtrace/osd-${pkgver}_dwarf.json" \
                                     "radostrace/rados-${pkgver}_dwarf.json"; do
-                            src="/tmp/gen-jsons/${pair}"
+                            src="/tmp/gen-extract/files/centos-stream/${pair}"
                             dst="files/centos-stream/${pair}"
                             if [ -f "$src" ]; then
                                 mkdir -p "$(dirname "$dst")"
@@ -307,7 +306,7 @@ jobs:
                             >> /tmp/verifyfailed.tsv
                         echo "::warning::runtime verification failed for $version (status=$st); dropping from PR"
                     fi
-                done < /tmp/manifest/succeeded.tsv
+                done < /tmp/gen/succeeded.tsv
             fi
             echo "copied=$copied" >> "$GITHUB_OUTPUT"
             echo "Copied $copied verified JSON file(s) into the working tree."
@@ -336,13 +335,13 @@ jobs:
                     echo '```'
                     echo
                 fi
-                if [ -s /tmp/manifest/failed.tsv ]; then
+                if [ -s /tmp/gen/failed.tsv ]; then
                     echo "## Versions that failed to generate"
                     echo
                     echo "These will be retried by the next scheduled run."
                     echo
                     echo '```'
-                    cat /tmp/manifest/failed.tsv
+                    cat /tmp/gen/failed.tsv
                     echo '```'
                     echo
                 fi

From a80e1ed999e0a8cd938fd59e34b9a45829cbc83d Mon Sep 17 00:00:00 2001
From: Dongdong Tao <dongdong.tao@canonical.com>
Date: Sat, 6 Jun 2026 17:01:07 +0900
Subject: [PATCH 3/4] tests/cephadm: retry bootstrap to survive the
 orch-backend race

`cephadm bootstrap` intermittently aborts at its final step: it restarts
the mgr to load the cephadm module, then immediately runs
`orch set backend cephadm`. When the orchestrator module has not finished
loading yet, that command fails with
`Error ENOTSUP ... Module 'orchestrator' is not enabled/loaded` and the
whole bootstrap fails. This flaked the tentacle (v20.2.1) verify cells two
ways:

- ubuntu-24.04: newer cephadm auto-rolled back the partial cluster, so the
  FSID probe came up empty -> "bootstrap failed to produce FSID".
- ubuntu-22.04: the older apt-installed cephadm did NOT roll back, leaving
  a partial /var/lib/ceph/<fsid> with no orchestrator backend. The script
  proceeded, OSDs were never scheduled, and wait_cephadm_healthy burned the
  full 900s timeout.

Make cluster launch robust: retry the bootstrap up to
CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3), purging any partial cluster
(rm-cluster + rm -rf) between attempts so stale mon/mgr containers don't
hold ports and the FSID probe can't latch onto a broken cluster id. On
total failure the helper returns non-zero and the caller fails fast instead
of timing out later against a backend-less cluster.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/functional-test-cephadm-rgw.sh |  2 +-
 tests/lib/cephadm-setup.sh           | 79 ++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/tests/functional-test-cephadm-rgw.sh b/tests/functional-test-cephadm-rgw.sh
index 474cbe5..514326c 100755
--- a/tests/functional-test-cephadm-rgw.sh
+++ b/tests/functional-test-cephadm-rgw.sh
@@ -144,7 +144,7 @@ info "OSD devices: ${OSD_DEVS[*]}"
 info "=== Step 4: bootstrap single-host cephadm cluster ==="
 MON_IP=$(hostname -I | awk '{print $1}')
 info "MON_IP=$MON_IP"
-FSID=$(cephadm_bootstrap_single_host "$CEPH_IMG" "$MON_IP" /tmp/cephadm)
+FSID=$(cephadm_bootstrap_single_host "$CEPH_IMG" "$MON_IP" /tmp/cephadm) || true
 [ -n "$FSID" ] || { err "bootstrap failed to produce FSID"; exit 1; }
 info "FSID=$FSID"
 
diff --git a/tests/lib/cephadm-setup.sh b/tests/lib/cephadm-setup.sh
index dfb9051..2a66497 100755
--- a/tests/lib/cephadm-setup.sh
+++ b/tests/lib/cephadm-setup.sh
@@ -115,14 +115,49 @@ provision_loopback_osds() {
 }
 
 
+# _purge_partial_clusters [cephadm_bin]
+#
+# Remove every cluster currently under /var/lib/ceph so the next bootstrap
+# attempt starts from a clean slate.  A failed `cephadm bootstrap` does NOT
+# reliably roll itself back: newer cephadm builds auto-delete on failure,
+# but the older apt-installed cephadm on Ubuntu 22.04 leaves the partial
+# /var/lib/ceph/<fsid> (plus its mon/mgr containers and systemd units)
+# behind.  Left in place that debris (a) makes the retry's port checks fail
+# because the old mon/mgr still hold the ports, and (b) fools the
+# `ls | head -1` FSID detection into echoing a stale, broken cluster id.
+_purge_partial_clusters() {
+    local cephadm_bin="${1:-cephadm}"  # cephadm executable to invoke (default: cephadm on PATH)
+    local fsid
+    for fsid in $(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$'); do
+        info "purging leftover cluster $fsid before bootstrap" >&2  # keep stdout clean: the bootstrap caller captures it
+        "$cephadm_bin" rm-cluster --fsid "$fsid" --force --zap-osds >&2 2>/dev/null || true
+        rm -rf "/var/lib/ceph/$fsid" 2>/dev/null || true  # best effort: drop whatever rm-cluster left behind
+    done
+}
+
+
 # cephadm_bootstrap_single_host <image> <mon_ip> [cephadm_bin]
 #
 # Bootstrap the cluster and echo the new FSID.  --single-host-defaults
 # relaxes the no-single-host warnings; --skip-mon-network avoids requiring
 # a real network range; --allow-overwrite lets the test be idempotent
 # across retries on the same runner.
+#
+# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3).
+# `cephadm bootstrap` has a known transient race near the end: it restarts
+# the mgr to load the cephadm module, then immediately runs
+# `orch set backend cephadm`.  If the orchestrator module has not finished
+# loading yet, that command fails with
+#   Error ENOTSUP ... Module 'orchestrator' is not enabled/loaded
+# and the whole bootstrap aborts.  It is not resumable, so the robust
+# response is to purge the half-built cluster and bootstrap again from
+# scratch.  Returns non-zero (echoing nothing) if every attempt fails, so
+# the caller fails fast instead of proceeding against a backend-less cluster
+# and timing out later while waiting for OSDs that can never be scheduled.
 cephadm_bootstrap_single_host() {
     local image="$1"; local mon_ip="$2"; local cephadm_bin="${3:-/tmp/cephadm}"
+    local max_attempts="${CEPHADM_BOOTSTRAP_ATTEMPTS:-3}"
+    local retry_delay="${CEPHADM_BOOTSTRAP_RETRY_DELAY:-15}"
     # --cluster-network is intentionally omitted: it requires a *network*
     # address (e.g. 10.0.0.0/24), not a host address, and the rejection
     # message is unhelpful ("has host bits set").  For a single-host cluster
@@ -135,17 +170,39 @@ cephadm_bootstrap_single_host() {
     # --no-cleanup-on-failure would let us inspect a partial bootstrap, but
     # it was added later in the quincy line and the apt-installed cephadm
     # on Ubuntu 22.04 rejects it.  CI doesn't need the inspection anyway —
-    # the default (auto-cleanup on failed bootstrap) is what we want there.
-    "$cephadm_bin" --image "$image" bootstrap \
-        --mon-ip "$mon_ip" \
-        --skip-mon-network \
-        --skip-firewalld \
-        --skip-dashboard \
-        --single-host-defaults \
-        --allow-overwrite \
-        --allow-mismatched-release \
-        >&2
-    ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1
+    # we purge partial clusters ourselves between attempts (see below).
+    local attempt rc
+    for (( attempt=1; attempt<=max_attempts; attempt++ )); do
+        # Always start from a clean slate: clears any debris left by a
+        # previous failed attempt (or a stale cluster from an earlier run on
+        # the same self-hosted runner).
+        _purge_partial_clusters "$cephadm_bin"
+
+        rc=0
+        "$cephadm_bin" --image "$image" bootstrap \
+            --mon-ip "$mon_ip" \
+            --skip-mon-network \
+            --skip-firewalld \
+            --skip-dashboard \
+            --single-host-defaults \
+            --allow-overwrite \
+            --allow-mismatched-release \
+            >&2 || rc=$?
+
+        if [[ $rc -eq 0 ]]; then
+            ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1
+            return 0
+        fi
+
+        err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc)"
+        if (( attempt < max_attempts )); then
+            info "retrying bootstrap in ${retry_delay}s ..."
+            sleep "$retry_delay"
+        fi
+    done
+
+    err "cephadm bootstrap failed after ${max_attempts} attempts"
+    return 1
 }
 
 

From e1f37971771830c21680a076a576e9804479a465 Mon Sep 17 00:00:00 2001
From: Dongdong Tao <dongdong.tao@canonical.com>
Date: Sat, 6 Jun 2026 17:29:09 +0900
Subject: [PATCH 4/4] tests/cephadm: judge bootstrap by orchestrator state, not
 exit code

The previous retry commit keyed success off the bootstrap exit code, which
broke the quincy cell (it had been passing). Two issues:

1. cephadm bootstrap's exit code conflates opposite outcomes. Bootstrapping
   an older image (quincy/reef) with a newer host cephadm tries to deploy
   services the image lacks -- `orch apply ceph-exporter` fails with EINVAL
   on quincy -- so bootstrap prints "Bootstrap complete." and leaves a fully
   working cluster, yet still exits non-zero (234). The old code ignored the
   exit code entirely (set -e is suppressed inside FSID=$(...)), so it passed
   then; my rc==0 gate wrongly rejected the good cluster and retried 3x.

   Now success is judged by the cluster's real state: an FSID exists AND the
   orchestrator backend is live (`ceph orch status` works). That accepts the
   benign quincy mismatch while still rejecting the tentacle orch-backend
   race (no backend -> retry), which is the property the downstream
   `ceph orch apply` for OSDs/RGW actually needs.

2. info() prints to stdout, and this helper's stdout is captured as the FSID
   by the caller. Any progress line (purge/retry/accept) leaked into the
   captured value -- e.g. "ERROR: not an fsid: INFO: retrying bootstrap...".
   Route every info() inside the capture to stderr; emit only the bare FSID
   on stdout.

Validated the stdout-only-FSID invariant locally across happy / quincy
(benign rc=234) / tentacle-retry / hard-fail scenarios.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/lib/cephadm-setup.sh | 69 ++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/tests/lib/cephadm-setup.sh b/tests/lib/cephadm-setup.sh
index 2a66497..2591c47 100755
--- a/tests/lib/cephadm-setup.sh
+++ b/tests/lib/cephadm-setup.sh
@@ -129,13 +129,30 @@ _purge_partial_clusters() {
     local cephadm_bin="${1:-cephadm}"
     local fsid
     for fsid in $(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$'); do
-        info "purging leftover cluster $fsid before bootstrap"
+        # NOTE: info() prints to stdout; this helper runs inside the
+        # FSID=$(cephadm_bootstrap_single_host ...) capture, so route every
+        # progress line to stderr or it pollutes the captured FSID.
+        info "purging leftover cluster $fsid before bootstrap" >&2
         "$cephadm_bin" rm-cluster --fsid "$fsid" --force --zap-osds >&2 2>/dev/null || true
         rm -rf "/var/lib/ceph/$fsid" 2>/dev/null || true
     done
 }
 
 
+# _orch_backend_ready <fsid> [cephadm_bin]
+#
+# Returns 0 iff `ceph orch status` succeeds inside `cephadm shell` for the
+# cluster <fsid>, i.e. the orchestrator backend is live; output is discarded.
+# The later `ceph orch apply` for OSDs/RGW needs that backend, so we test it
+# directly instead of trusting the bootstrap exit code (see
+# cephadm_bootstrap_single_host).  cephadm binary: $2 if given, else the
+# caller's $cephadm_bin (bash dynamic scoping), else `cephadm` from PATH.
+_orch_backend_ready() {
+    local fsid="$1"
+    "${2:-${cephadm_bin:-cephadm}}" shell --fsid "$fsid" -- ceph orch status >/dev/null 2>&1
+}
+
+
 # cephadm_bootstrap_single_host <image> <mon_ip> [cephadm_bin]
 #
 # Bootstrap the cluster and echo the new FSID.  --single-host-defaults
@@ -143,17 +160,28 @@ _purge_partial_clusters() {
 # a real network range; --allow-overwrite lets the test be idempotent
 # across retries on the same runner.
 #
-# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3).
-# `cephadm bootstrap` has a known transient race near the end: it restarts
-# the mgr to load the cephadm module, then immediately runs
-# `orch set backend cephadm`.  If the orchestrator module has not finished
-# loading yet, that command fails with
-#   Error ENOTSUP ... Module 'orchestrator' is not enabled/loaded
-# and the whole bootstrap aborts.  It is not resumable, so the robust
-# response is to purge the half-built cluster and bootstrap again from
-# scratch.  Returns non-zero (echoing nothing) if every attempt fails, so
-# the caller fails fast instead of proceeding against a backend-less cluster
-# and timing out later while waiting for OSDs that can never be scheduled.
+# Bootstrap is retried up to CEPHADM_BOOTSTRAP_ATTEMPTS times (default 3),
+# and success is judged by the cluster's actual state, NOT by the bootstrap
+# exit code.  That distinction matters because the exit code conflates two
+# opposite outcomes:
+#
+#   * Transient, fatal (must retry): a race near the end of bootstrap — the
+#     mgr restarts to load the cephadm module, then `orch set backend
+#     cephadm` runs before the orchestrator module finishes loading and
+#     fails with `Error ENOTSUP ... Module 'orchestrator' is not
+#     enabled/loaded`.  The whole bootstrap aborts and leaves no working
+#     backend.  Seen on tentacle (v20.2.x).
+#   * Benign, non-fatal (must accept): bootstrapping an older image (quincy,
+#     reef) with a newer host cephadm tries to deploy services the image
+#     does not know — e.g. `orch apply ceph-exporter` fails with EINVAL on
+#     quincy.  cephadm logs it, prints "Bootstrap complete.", and leaves a
+#     fully functional cluster, yet still exits non-zero.
+#
+# So after each attempt we keep the cluster only if its orchestrator backend
+# is actually live (_orch_backend_ready); otherwise we purge it and retry.
+# Returns non-zero (echoing nothing) if every attempt fails, so the caller
+# fails fast instead of proceeding against a backend-less cluster and timing
+# out later while waiting for OSDs that can never be scheduled.
 cephadm_bootstrap_single_host() {
     local image="$1"; local mon_ip="$2"; local cephadm_bin="${3:-/tmp/cephadm}"
     local max_attempts="${CEPHADM_BOOTSTRAP_ATTEMPTS:-3}"
@@ -171,7 +199,7 @@ cephadm_bootstrap_single_host() {
     # it was added later in the quincy line and the apt-installed cephadm
     # on Ubuntu 22.04 rejects it.  CI doesn't need the inspection anyway —
     # we purge partial clusters ourselves between attempts (see below).
-    local attempt rc
+    local attempt rc fsid
     for (( attempt=1; attempt<=max_attempts; attempt++ )); do
         # Always start from a clean slate: clears any debris left by a
         # previous failed attempt (or a stale cluster from an earlier run on
@@ -189,14 +217,21 @@ cephadm_bootstrap_single_host() {
             --allow-mismatched-release \
             >&2 || rc=$?
 
-        if [[ $rc -eq 0 ]]; then
-            ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1
+        fsid=$(ls /var/lib/ceph/ 2>/dev/null | grep -E '^[0-9a-f]{8}-[0-9a-f-]+$' | head -1)
+        if [[ -n "$fsid" ]] && _orch_backend_ready "$fsid"; then
+            # Cluster is up with a working orchestrator backend; a non-zero
+            # rc here is the benign service-apply mismatch described above.
+            # info() goes to stdout, which this function's caller captures as
+            # the FSID — keep all progress on stderr and emit ONLY the bare
+            # FSID on stdout.
+            [[ $rc -eq 0 ]] || info "bootstrap exited rc=$rc but orchestrator backend is live — accepting cluster $fsid" >&2
+            echo "$fsid"
             return 0
         fi
 
-        err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc)"
+        err "cephadm bootstrap attempt ${attempt}/${max_attempts} failed (rc=$rc, orchestrator backend not ready)"
         if (( attempt < max_attempts )); then
-            info "retrying bootstrap in ${retry_delay}s ..."
+            info "retrying bootstrap in ${retry_delay}s ..." >&2
             sleep "$retry_delay"
         fi
     done