Flakey test repro #4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manual stress-repro workflow for flakey integration tests.
#
# Runs one pytest session per shard against a list of test files, with
# each test repeated --count times via pytest-repeat. Broker setup/teardown
# is amortized across the whole session, so cross-test state accumulates
# (topics, coordinator records, broker memory) the way it does in a normal
# `make test` run. This is the "middle ground" configuration between raw
# iteration count and shell-loop sequence fidelity; see task #12 discussion.
#
# Trigger from the Actions UI via "Run workflow" (workflow_dispatch only) —
# does NOT run on PRs or pushes, to avoid burning runner minutes on every
# commit. Default test_node stacks test_admin_integration.py and
# test_consumer_group.py so test_heartbeat_thread runs against a broker
# that has seen the same prior tests it would in `make test`.
#
# Workflow title.
name: Flakey test repro

# Manual trigger only. No input declares a `type`, and every one is
# required with a default, so "Run workflow" works without edits.
on:
  workflow_dispatch:
    inputs:
      # Test files / node ids handed to pytest.
      test_node:
        description: 'pytest node id(s) to stress (space-separated, word-split)'
        default: 'test/integration/test_admin_integration.py test/integration/test_consumer_group.py'
        required: true
      # Value passed to pytest as --count.
      count:
        description: 'pytest-repeat --count (each test repeated this many times per session)'
        default: '20'
        required: true
      # Value passed to stress-ng as --cpu; '0' skips the load step.
      cpu_load:
        description: 'stress-ng CPU workers running alongside (0 disables)'
        default: '2'
        required: true
      # Selects the Kafka release to install and the release cache key.
      kafka_version:
        description: 'Kafka broker version'
        default: '4.2.0'
        required: true
      # Interpreter version handed to actions/setup-python.
      python_version:
        description: 'Python version'
        default: '3.14'
        required: true
# Workflow-level environment variables. Values are quoted so they stay
# strings rather than being read as integers.
env:
  FORCE_COLOR: '1'
  PIP_DISABLE_PIP_VERSION_CHECK: '1'
  PIP_NO_PYTHON_VERSION_WARNING: '1'
jobs:
  # One stress job, fanned out over the shard matrix below. Each shard
  # installs Python, Java and a Kafka release, optionally starts background
  # CPU load, runs a single repeated pytest session, then stops the load and
  # uploads pytest.log if anything failed.
  stress:
    runs-on: ubuntu-latest
    name: "Stress shard ${{ matrix.shard }}"
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        # Four independent shards on four runners. Each shard runs the same
        # stress config; multiplying shards multiplies aggregate iterations
        # without serializing. A shard that catches the bug exits fast (-x);
        # other shards continue so we get as many traces as possible.
        shard: [1, 2, 3, 4]
    steps:
      - uses: actions/checkout@v6

      - name: Set up Python ${{ github.event.inputs.python_version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ github.event.inputs.python_version }}
          cache: pip
          cache-dependency-path: |
            requirements-dev.txt

      - name: Install dependencies
        run: |
          # Refresh the package index before installing: a stale index on
          # the runner image can point at package versions that no longer
          # exist on the mirror. apt-get is used instead of apt because it
          # is the interface intended for scripts.
          sudo apt-get update
          sudo apt-get install -y libsnappy-dev libzstd-dev stress-ng
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
          pip install pytest-repeat

      - name: Setup java
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          # Quoted so the version is always handled as a string.
          java-version: "23"

      - name: Restore cached kafka releases
        id: cache-servers-dist-restore
        uses: actions/cache/restore@v5
        with:
          path: servers/dist
          key: servers-dist-${{ github.event.inputs.kafka_version }}

      - name: Install Kafka release
        # The version reaches the shell through an environment variable
        # rather than being pasted into the script text, so the input is
        # never parsed as shell syntax.
        run: make "servers/${REPRO_KAFKA_VERSION}/kafka-bin"
        env:
          REPRO_KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}

      - name: Update kafka release cache
        # An existing cache entry cannot be overwritten, so skip the save
        # when the restore step reported an exact hit on the same key.
        if: steps.cache-servers-dist-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v5
        with:
          path: servers/dist
          key: ${{ steps.cache-servers-dist-restore.outputs.cache-primary-key }}

      - name: Start CPU load
        if: github.event.inputs.cpu_load != '0'
        run: |
          # Background stress-ng; capture PID so we can stop it in the
          # cleanup step regardless of whether pytest passed or failed.
          # The worker count comes from the environment (not inline
          # expression expansion) so it is passed as a single argument.
          stress-ng --cpu "$REPRO_CPU_LOAD" \
            --timeout 3600s \
            --metrics-brief &
          echo "STRESS_PID=$!" >> "$GITHUB_ENV"
        env:
          REPRO_CPU_LOAD: ${{ github.event.inputs.cpu_load }}

      - name: Run stress test
        id: pytest
        run: |
          # Single pytest session, one broker setup/teardown amortized
          # across the whole run. pytest-repeat --count=N parametrizes
          # each test N times (grouped by test, in source order within
          # a file) so cross-test state accumulates on the broker the
          # way it does in `make test`. No xdist: serialized execution
          # preserves ordering and keeps the test-to-broker-state
          # relationship deterministic.
          #
          # -x stops on first failure so a shard that catches the bug
          # exits fast and uploads its pytest log. --log-cli-level=DEBUG
          # surfaces the coordinator debug statements from task #12.
          #
          # pipefail is required: without it the pipeline's status is
          # tee's (always 0), a failing pytest run would leave this step
          # green, and the "Upload logs on failure" step would never run.
          set -o pipefail
          #
          # REPRO_TEST_NODE is deliberately unquoted so space-separated
          # input like "file_a.py file_b.py" word-splits into separate
          # pytest arguments. It is read from the environment so the input
          # is only word-split, never parsed as shell syntax.
          # shellcheck disable=SC2086
          pytest $REPRO_TEST_NODE \
            --count="$REPRO_COUNT" \
            --timeout=120 \
            -x \
            -v \
            --log-cli-level=DEBUG \
            2>&1 | tee pytest.log
        env:
          KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
          REPRO_TEST_NODE: ${{ github.event.inputs.test_node }}
          REPRO_COUNT: ${{ github.event.inputs.count }}

      - name: Stop CPU load
        # Runs even after a failed step, but only if the load step exported
        # a PID. `|| true` keeps cleanup green if the process already exited.
        if: always() && env.STRESS_PID != ''
        run: kill "$STRESS_PID" || true

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: repro-logs-shard-${{ matrix.shard }}
          path: |
            pytest.log
          if-no-files-found: warn