Skip to content

Commit cd15ddd

Browse files
committed
flakey-test-repro workflow
1 parent 300033e commit cd15ddd

1 file changed

Lines changed: 139 additions & 0 deletions

File tree

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# Manual stress-repro workflow for flakey integration tests.
2+
#
3+
# Run a single pytest node id many times, in parallel, under CPU pressure,
4+
# to maximize the chance of catching timing-sensitive flakes. Uploads the
5+
# pytest log from any shard that catches a failure.
6+
#
7+
# Trigger from the Actions UI via "Run workflow" (workflow_dispatch only) —
8+
# does NOT run on PRs or pushes, to avoid burning runner minutes on every
9+
# commit. Defaults target the test_heartbeat_thread rejoin hang, but any
10+
# pytest node id can be supplied.
11+
#
12+
name: Flakey test repro

# Manual trigger only: workflow_dispatch is the sole event, so this workflow
# never starts from a push or pull request. Every knob of the stress run is
# exposed as a required dispatch input with a default value.
on:
  workflow_dispatch:
    inputs:
      # Test selection.
      test_node:
        description: 'pytest node id to stress'
        required: true
        default: 'test/integration/test_consumer_group.py::test_heartbeat_thread'

      # Stress shape: repetitions, xdist workers, and background CPU load.
      count:
        description: 'pytest-repeat --count per shard'
        required: true
        default: '100'
      parallelism:
        description: 'pytest-xdist workers per shard'
        required: true
        default: '4'
      cpu_load:
        description: 'stress-ng CPU workers running alongside (0 disables)'
        required: true
        default: '2'

      # Toolchain versions. Kept as quoted strings so values such as 3.10
      # are never re-typed as floats by a YAML parser.
      kafka_version:
        description: 'Kafka broker version'
        required: true
        default: '4.2.0'
      python_version:
        description: 'Python version'
        required: true
        default: '3.14'
41+
42+
# Workflow-level environment: these variables are exported to every step of
# every job. Values are quoted so they stay strings rather than integers.
env:
  FORCE_COLOR: '1'
  # Keep pip output quiet about its own version and the Python version.
  PIP_DISABLE_PIP_VERSION_CHECK: '1'
  PIP_NO_PYTHON_VERSION_WARNING: '1'
46+
47+
jobs:
  # Runs the selected pytest node repeatedly on several runners at once and
  # uploads pytest.log from any shard whose run fails.
  stress:
    name: 'Stress shard ${{ matrix.shard }}'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        # Four independent shards on four runners. Each shard runs the same
        # stress config; multiplying shards multiplies aggregate iterations
        # without serializing. A shard that catches the bug exits fast (-x);
        # other shards continue so we get as many traces as possible.
        shard: [1, 2, 3, 4]

    steps:
      - uses: actions/checkout@v6

      - name: Set up Python ${{ github.event.inputs.python_version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ github.event.inputs.python_version }}
          cache: pip
          cache-dependency-path: |
            requirements-dev.txt

      - name: Install dependencies
        run: |
          # Refresh the package index first so the install does not fail on
          # stale mirrors; apt-get is the script-stable front-end.
          sudo apt-get update
          sudo apt-get install -y libsnappy-dev libzstd-dev stress-ng
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
          # NOTE(review): the pytest invocation below passes --timeout, which
          # needs pytest-timeout; presumably requirements-dev.txt provides
          # it -- confirm.
          pip install pytest-repeat pytest-xdist

      - name: Setup java
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: '23'

      - name: Restore cached kafka releases
        id: cache-servers-dist-restore
        uses: actions/cache/restore@v5
        with:
          path: servers/dist
          key: servers-dist-${{ github.event.inputs.kafka_version }}

      - name: Install Kafka release
        # The input is handed over through the environment instead of being
        # pasted into the script text, so it cannot be parsed as shell code.
        env:
          REPRO_KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
        run: make "servers/${REPRO_KAFKA_VERSION}/kafka-bin"

      - name: Update kafka release cache
        # Cache entries are immutable: after an exact hit there is nothing to
        # save, and attempting it from every shard only produces warnings.
        if: steps.cache-servers-dist-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v5
        with:
          path: servers/dist
          key: ${{ steps.cache-servers-dist-restore.outputs.cache-primary-key }}

      - name: Start CPU load
        if: github.event.inputs.cpu_load != '0'
        env:
          REPRO_CPU_LOAD: ${{ github.event.inputs.cpu_load }}
        run: |
          # Background stress-ng; capture PID so we can stop it in the
          # cleanup step regardless of whether pytest passed or failed.
          stress-ng --cpu "$REPRO_CPU_LOAD" \
            --timeout 3600s \
            --metrics-brief &
          echo "STRESS_PID=$!" >> "$GITHUB_ENV"

      - name: Run stress test
        id: pytest
        env:
          KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
          REPRO_TEST_NODE: ${{ github.event.inputs.test_node }}
          REPRO_COUNT: ${{ github.event.inputs.count }}
          REPRO_PARALLELISM: ${{ github.event.inputs.parallelism }}
        run: |
          # The default run shell is `bash -e` without pipefail, so the
          # pipeline would report tee's exit status and a failing pytest run
          # would pass the step (and skip the log upload). Propagate pytest's
          # status through the pipe.
          set -o pipefail
          # -x stops on first failure so a shard that catches the bug exits
          # fast and uploads logs. --log-cli-level=DEBUG surfaces the debug
          # statements already in the coordinator for task #12.
          pytest "$REPRO_TEST_NODE" \
            --count="$REPRO_COUNT" \
            -n "$REPRO_PARALLELISM" \
            --timeout=120 \
            -x \
            -v \
            --log-cli-level=DEBUG \
            2>&1 | tee pytest.log

      - name: Stop CPU load
        # Runs even after a failed step, but only if stress-ng was started.
        if: always() && env.STRESS_PID != ''
        run: kill "$STRESS_PID" || true

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: repro-logs-shard-${{ matrix.shard }}
          path: |
            pytest.log
          if-no-files-found: warn

0 commit comments

Comments
 (0)