# Flakey test repro
#
# Flakey test repro #4

# Manual stress-repro workflow for flakey integration tests.
#
# Runs one pytest session per shard against a list of test files, with
# each test repeated --count times via pytest-repeat. Broker setup/teardown
# is amortized across the whole session, so cross-test state accumulates
# (topics, coordinator records, broker memory) the way it does in a normal
# `make test` run. This is the "middle ground" configuration between raw
# iteration count and shell-loop sequence fidelity; see task #12 discussion.
#
# Trigger from the Actions UI via "Run workflow" (workflow_dispatch only) —
# does NOT run on PRs or pushes, to avoid burning runner minutes on every
# commit. Default test_node stacks test_admin_integration.py and
# test_consumer_group.py so test_heartbeat_thread runs against a broker
# that has seen the same prior tests it would in `make test`.
#
name: Flakey test repro

# The only trigger is a manual dispatch; every knob below is a string input
# that the job reads through `github.event.inputs.<name>`.
on:
  workflow_dispatch:
    inputs:
      # One or more pytest targets, separated by spaces.
      test_node:
        description: 'pytest node id(s) to stress (space-separated, word-split)'
        required: true
        # Folded scalar: the two lines join with a single space.
        default: >-
          test/integration/test_admin_integration.py
          test/integration/test_consumer_group.py
      # Value handed to pytest-repeat's --count option.
      count:
        description: 'pytest-repeat --count (each test repeated this many times per session)'
        required: true
        default: '20'
      # Number of stress-ng CPU workers; the job skips the load step on '0'.
      cpu_load:
        description: 'stress-ng CPU workers running alongside (0 disables)'
        required: true
        default: '2'
      # Used for the broker install target, the cache key and KAFKA_VERSION.
      kafka_version:
        description: 'Kafka broker version'
        required: true
        default: '4.2.0'
      # Passed to actions/setup-python; kept quoted so it stays a string.
      python_version:
        description: 'Python version'
        required: true
        default: '3.14'
# Workflow-level environment, visible to every step of every job.
# Values are quoted so they reach the process as the string "1".
env:
  # Presumably forces colored output from tools that honor FORCE_COLOR.
  FORCE_COLOR: '1'
  # Environment-variable forms of pip's --disable-pip-version-check and
  # --no-python-version-warning options.
  PIP_DISABLE_PIP_VERSION_CHECK: '1'
  PIP_NO_PYTHON_VERSION_WARNING: '1'
jobs:
  # Runs the selected integration tests repeatedly, in one pytest session per
  # shard, optionally with stress-ng CPU load running alongside. On failure
  # the shard uploads its pytest.log as an artifact.
  stress:
    runs-on: ubuntu-latest
    name: "Stress shard ${{ matrix.shard }}"
    timeout-minutes: 60
    # No step in this job writes to the repository, so the token only needs
    # read access for the checkout.
    permissions:
      contents: read
    strategy:
      fail-fast: false
      matrix:
        # Four independent shards on four runners. Each shard runs the same
        # stress config; multiplying shards multiplies aggregate iterations
        # without serializing. A shard that catches the bug exits fast (-x);
        # other shards continue so we get as many traces as possible.
        shard: [1, 2, 3, 4]
    steps:
      - uses: actions/checkout@v6
      - name: Set up Python ${{ github.event.inputs.python_version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ github.event.inputs.python_version }}
          cache: pip
          cache-dependency-path: |
            requirements-dev.txt
      - name: Install dependencies
        run: |
          # Refresh the package index first: the index baked into the runner
          # image can be stale, which makes the install fail with 404s.
          sudo apt-get update
          sudo apt-get install -y libsnappy-dev libzstd-dev stress-ng
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
          pip install pytest-repeat
      - name: Setup java
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: "23"
      - name: Restore cached kafka releases
        id: cache-servers-dist-restore
        uses: actions/cache/restore@v5
        with:
          path: servers/dist
          key: servers-dist-${{ github.event.inputs.kafka_version }}
      - name: Install Kafka release
        # The input reaches the shell through an environment variable rather
        # than being pasted into the script text, so its content cannot be
        # interpreted as shell syntax.
        run: make "servers/${REPRO_KAFKA_VERSION}/kafka-bin"
        env:
          REPRO_KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
      - name: Update kafka release cache
        # Only save after a miss: an exact hit means this key already exists,
        # and a second save for the same key cannot succeed.
        if: steps.cache-servers-dist-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v5
        with:
          path: servers/dist
          key: ${{ steps.cache-servers-dist-restore.outputs.cache-primary-key }}
      - name: Start CPU load
        if: github.event.inputs.cpu_load != '0'
        run: |
          # Background stress-ng; capture PID so we can stop it in the
          # cleanup step regardless of whether pytest passed or failed.
          stress-ng --cpu "$REPRO_CPU_LOAD" \
            --timeout 3600s \
            --metrics-brief &
          echo "STRESS_PID=$!" >> "$GITHUB_ENV"
        env:
          REPRO_CPU_LOAD: ${{ github.event.inputs.cpu_load }}
      - name: Run stress test
        id: pytest
        # Explicit `shell: bash` makes GitHub run the script with
        # `-eo pipefail`. Without it the pipeline below would report tee's
        # exit status, a failing pytest run would leave the step green, and
        # the failure-only log upload would never run.
        shell: bash
        run: |
          # Single pytest session, one broker setup/teardown amortized
          # across the whole run. pytest-repeat --count=N parametrizes
          # each test N times (grouped by test, in source order within
          # a file) so cross-test state accumulates on the broker the
          # way it does in `make test`. No xdist: serialized execution
          # preserves ordering and keeps the test-to-broker-state
          # relationship deterministic.
          #
          # -x stops on first failure so a shard that catches the bug
          # exits fast and uploads its pytest log. --log-cli-level=DEBUG
          # surfaces the coordinator debug statements from task #12.
          #
          # REPRO_TEST_NODE is deliberately unquoted so space-separated
          # input like "file_a.py file_b.py" word-splits into separate
          # pytest arguments. Because it arrives as an environment
          # variable, quotes or other shell syntax inside the input are
          # passed through literally instead of being evaluated.
          pytest $REPRO_TEST_NODE \
            --count="$REPRO_COUNT" \
            --timeout=120 \
            -x \
            -v \
            --log-cli-level=DEBUG \
            2>&1 | tee pytest.log
        env:
          KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
          REPRO_TEST_NODE: ${{ github.event.inputs.test_node }}
          REPRO_COUNT: ${{ github.event.inputs.count }}
      - name: Stop CPU load
        # always() keeps the cleanup running after a failed pytest step;
        # STRESS_PID is only set when the load step actually ran.
        if: always() && env.STRESS_PID != ''
        run: kill "$STRESS_PID" || true
      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: repro-logs-shard-${{ matrix.shard }}
          path: |
            pytest.log
          if-no-files-found: warn