From 4170e4368e1d639cdf257f81734eae16aa66caf6 Mon Sep 17 00:00:00 2001
From: Mitch Wagner <mitch.wagner@antithesis.com>
Date: Wed, 6 May 2026 15:25:19 -0400
Subject: [PATCH 01/65] feat: working local antithesis build

---
 antithesis/AGENTS.md                          |  15 +
 antithesis/Makefile                           | 104 ++++++
 antithesis/config/docker-compose.yaml         | 320 ++++++++++++++++++
 antithesis/scratchbook/bug-candidates.md      | 161 +++++++++
 antithesis/scratchbook/deployment-topology.md | 157 +++++++++
 antithesis/scratchbook/existing-assertions.md |  37 ++
 .../catalog-recovery-consistency.md           |  33 ++
 .../properties/command-channel-ordering.md    |  28 ++
 .../compute-replica-epoch-isolation.md        |  25 ++
 .../critical-reader-fence-linearization.md    |  24 ++
 .../properties/deployment-lag-detection.md    |  26 ++
 .../properties/deployment-promotion-safety.md |  26 ++
 .../epoch-fencing-prevents-split-brain.md     |  35 ++
 .../properties/fault-recovery-exercised.md    |  28 ++
 .../properties/group-commit-toctou-safety.md  |  28 ++
 .../idempotent-write-under-indeterminate.md   |  28 ++
 .../properties/mv-reflects-source-updates.md  |  32 ++
 .../properties/peek-lifecycle-exactly-once.md |  35 ++
 .../properties/persist-cas-monotonicity.md    |  34 ++
 .../properties/source-ingestion-progress.md   |  27 ++
 .../storage-command-replay-idempotent.md      |  28 ++
 .../properties/strict-serializable-reads.md   |  34 ++
 .../properties/tombstone-sealing-finality.md  |  22 ++
 antithesis/scratchbook/property-catalog.md    | 217 ++++++++++++
 .../scratchbook/property-relationships.md     |  56 +++
 antithesis/scratchbook/sut-analysis.md        | 217 ++++++++++++
 test/antithesis/export-compose.py             |  58 ++++
 test/antithesis/mzcompose.py                  |  88 +++++
 test/antithesis/workload/Dockerfile           |  34 ++
 test/antithesis/workload/mzbuild.yml          |   1 +
 test/antithesis/workload/setup-complete.sh    |  22 ++
 .../workload/test/anytime_health_check.sh     |  19 ++
 .../workload/workload-entrypoint.sh           |  16 +
 33 files changed, 2015 insertions(+)
 create mode 100644 antithesis/AGENTS.md
 create mode 100644 antithesis/Makefile
 create mode 100644 antithesis/config/docker-compose.yaml
 create mode 100644 antithesis/scratchbook/bug-candidates.md
 create mode 100644 antithesis/scratchbook/deployment-topology.md
 create mode 100644 antithesis/scratchbook/existing-assertions.md
 create mode 100644 antithesis/scratchbook/properties/catalog-recovery-consistency.md
 create mode 100644 antithesis/scratchbook/properties/command-channel-ordering.md
 create mode 100644 antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
 create mode 100644 antithesis/scratchbook/properties/critical-reader-fence-linearization.md
 create mode 100644 antithesis/scratchbook/properties/deployment-lag-detection.md
 create mode 100644 antithesis/scratchbook/properties/deployment-promotion-safety.md
 create mode 100644 antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
 create mode 100644 antithesis/scratchbook/properties/fault-recovery-exercised.md
 create mode 100644 antithesis/scratchbook/properties/group-commit-toctou-safety.md
 create mode 100644 antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
 create mode 100644 antithesis/scratchbook/properties/mv-reflects-source-updates.md
 create mode 100644 antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
 create mode 100644 antithesis/scratchbook/properties/persist-cas-monotonicity.md
 create mode 100644 antithesis/scratchbook/properties/source-ingestion-progress.md
 create mode 100644 antithesis/scratchbook/properties/storage-command-replay-idempotent.md
 create mode 100644 antithesis/scratchbook/properties/strict-serializable-reads.md
 create mode 100644 antithesis/scratchbook/properties/tombstone-sealing-finality.md
 create mode 100644 antithesis/scratchbook/property-catalog.md
 create mode 100644 antithesis/scratchbook/property-relationships.md
 create mode 100644 antithesis/scratchbook/sut-analysis.md
 create mode 100644 test/antithesis/export-compose.py
 create mode 100644 test/antithesis/mzcompose.py
 create mode 100644 test/antithesis/workload/Dockerfile
 create mode 100644 test/antithesis/workload/mzbuild.yml
 create mode 100755 test/antithesis/workload/setup-complete.sh
 create mode 100755 test/antithesis/workload/test/anytime_health_check.sh
 create mode 100755 test/antithesis/workload/workload-entrypoint.sh

diff --git a/antithesis/AGENTS.md b/antithesis/AGENTS.md
new file mode 100644
index 0000000000000..ff80e8994fb67
--- /dev/null
+++ b/antithesis/AGENTS.md
@@ -0,0 +1,15 @@
+This directory contains files relevant to running tests in Antithesis.
+
+Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands.
+
+**setup-complete.sh**
+Inject this script into a Dockerfile to notify Antithesis that setup is complete. This script should only run once the system under test is ready for testing. Antithesis will not run any test commands until it receives this event.
+
+**config**
+This directory contains the `docker-compose.yaml` file used to bring up this system within the Antithesis environment, along with any closely related config files.
+
+**scratchbook**
+This directory is the Antithesis scratchbook for the codebase. It contains documents such as system analysis, property catalogs, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, and other persistent integration notes. Keep it up to date as Antithesis-related decisions change.
+
+**test**
+This directory contains test templates. A test template is a directory containing test command executable files. Each test command must have a valid prefix: `parallel_driver_, singleton_driver_, serial_driver_, first_, eventually_, finally_, anytime_`. Prefixes constrain when and how commands are composed in a single timeline. Files or subdirectories prefixed with `helper_` are ignored by Test Composer and can be used for helper scripts kept alongside the commands.
diff --git a/antithesis/Makefile b/antithesis/Makefile
new file mode 100644
index 0000000000000..d29e795d22be7
--- /dev/null
+++ b/antithesis/Makefile
@@ -0,0 +1,104 @@
+# Build / run helper for the Materialize Antithesis harness.
+#
+# Usage:
+#   make build                 # build every local image
+#   make up                    # export compose, build, bring up the stack
+#   make test                  # smoke test against the running cluster
+#   make push                  # push locally-built images to Antithesis registry
+#   make down                  # tear down (preserves volumes)
+#   make clean                 # tear down + remove volumes + images
+#   make smoke                 # full cycle: build → up → test
+
+SHELL := /usr/bin/env bash
+.SHELLFLAGS := -eu -o pipefail -c
+
+PROJECT   := materialize
+REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/..)
+
+ifndef RUNTIME
+  RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none))
+endif
+ifeq ($(RUNTIME),none)
+  $(error neither podman nor docker found in PATH; set RUNTIME=docker or install podman)
+endif
+
+COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f $(REPO_ROOT)/antithesis/config/docker-compose.yaml
+PSQL    := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize
+
+REGISTRY      ?= us-central1-docker.pkg.dev
+REGISTRY_PATH ?= /molten-verve-216720/materialize-repository
+
+# ---------------------------------------------------------------------------
+# Export — generate the resolved docker-compose YAML for Antithesis.
+# ---------------------------------------------------------------------------
+.PHONY: export-compose
+export-compose:
+	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml
+	@echo "Wrote config/docker-compose.yaml"
+
+# ---------------------------------------------------------------------------
+# Build — build images that don't have public equivalents.
+# ---------------------------------------------------------------------------
+LOCAL_IMAGES  := workload
+BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%)
+
+.PHONY: build $(BUILD_TARGETS)
+build: $(BUILD_TARGETS)
+
+$(BUILD_TARGETS): build-%:
+	$(RUNTIME) build \
+	  --platform linux/amd64 \
+	  -t $(PROJECT)-$*:latest \
+	  $(REPO_ROOT)/test/antithesis/$*
+
+# ---------------------------------------------------------------------------
+# Up / Down
+# ---------------------------------------------------------------------------
+.PHONY: up
+up: export-compose build
+	$(COMPOSE) up -d
+
+.PHONY: down
+down:
+	$(COMPOSE) down
+
+# ---------------------------------------------------------------------------
+# Test — quick smoke test against the running cluster
+# ---------------------------------------------------------------------------
+.PHONY: test
+test:
+	$(PSQL) -c "CREATE TABLE IF NOT EXISTS smoke_test (k INT, v TEXT)"
+	$(PSQL) -c "INSERT INTO smoke_test VALUES (1, 'hello'), (2, 'world')"
+	$(PSQL) -c "SELECT * FROM smoke_test ORDER BY k"
+	$(PSQL) -c "DROP TABLE smoke_test"
+
+# ---------------------------------------------------------------------------
+# Push — tag local images and push to the Antithesis registry
+# ---------------------------------------------------------------------------
+.PHONY: push
+push:
+	@$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \
+	  | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \
+	  | while read item; do \
+	      nametag="$${item#localhost/}"; \
+	      name="$${nametag%:*}"; \
+	      remote="$(REGISTRY)$(REGISTRY_PATH)/$${name}:latest"; \
+	      echo "Pushing $${item} -> $${remote}"; \
+	      $(RUNTIME) tag "$${item}" "$${remote}" || exit 1; \
+	      $(RUNTIME) push "$${remote}" || exit 1; \
+	  done
+
+# ---------------------------------------------------------------------------
+# Clean
+# ---------------------------------------------------------------------------
+.PHONY: clean
+clean: down
+	$(COMPOSE) down -v --remove-orphans 2>/dev/null || true
+	-$(RUNTIME) rmi $$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' | grep '^\(localhost/\)\?$(PROJECT)-' || true) 2>/dev/null
+
+# ---------------------------------------------------------------------------
+# Smoke — full cycle: build → up → test
+# ---------------------------------------------------------------------------
+.PHONY: smoke
+smoke: up test
+	@echo "[smoke] passed"
diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml
new file mode 100644
index 0000000000000..6eb68d6f7e789
--- /dev/null
+++ b/antithesis/config/docker-compose.yaml
@@ -0,0 +1,320 @@
+services:
+  postgres-metadata:
+    command:
+    - postgres
+    - -c
+    - wal_level=logical
+    - -c
+    - max_wal_senders=100
+    - -c
+    - max_replication_slots=100
+    - -c
+    - max_connections=5000
+    ports:
+    - '26257'
+    environment:
+    - POSTGRESDB=postgres
+    - POSTGRES_PASSWORD=postgres
+    - LD_PRELOAD=libeatmydata.so
+    - PGPORT=26257
+    - POSTGRES_HOST_AUTH_METHOD=trust
+    healthcheck:
+      test:
+      - CMD
+      - pg_isready
+      - -U
+      - postgres
+      interval: 1s
+      start_period: 30s
+    restart: 'no'
+    volumes:
+    - ../../misc/postgres/setup_materialize.sql:/docker-entrypoint-initdb.d/z_setup_materialize.sql
+    platform: linux/amd64
+    image: postgres:17.7
+  minio:
+    entrypoint:
+    - sh
+    - -c
+    command:
+    - mkdir -p /data/persist && minio server /data --console-address :9001
+    ports:
+    - 9000
+    - 9001
+    environment:
+    - MINIO_STORAGE_CLASS_STANDARD=EC:0
+    - MINIO_HEAL_DISABLE=on
+    - MINIO_DISK_WATERMARK_LOW=1
+    - MINIO_DISK_WATERMARK_HIGH=1
+    healthcheck:
+      test:
+      - CMD
+      - curl
+      - --fail
+      - http://localhost:9000/minio/health/live
+      timeout: 5s
+      interval: 1s
+      start_period: 30s
+    platform: linux/amd64
+    image: minio/minio:latest
+  redpanda:
+    image: redpandadata/redpanda:v25.2.11
+    ports:
+    - 9092
+    - 8081
+    command:
+    - redpanda
+    - start
+    - --overprovisioned
+    - --smp=1
+    - --memory=1G
+    - --reserve-memory=0M
+    - --node-id=0
+    - --check=false
+    - --set
+    - redpanda.enable_transactions=true
+    - --set
+    - redpanda.enable_idempotence=true
+    - --set
+    - redpanda.auto_create_topics_enabled=True
+    - --set
+    - redpanda.topic_memory_per_partition=4096
+    - --set
+    - --advertise-kafka-addr=kafka:9092
+    networks:
+      default:
+        aliases:
+        - kafka
+        - schema-registry
+    healthcheck:
+      test:
+      - CMD
+      - curl
+      - -f
+      - localhost:9644/v1/status/ready
+      interval: 1s
+      start_period: 120s
+    platform: linux/amd64
+  materialized:
+    hostname: materialized
+    depends_on:
+      minio:
+        condition: service_started
+      postgres-metadata:
+        condition: service_healthy
+    command:
+    - --unsafe-mode
+    - --environment-id=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - --persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/&region=minio
+    - --orchestrator-process-propagate-crashes
+    - --persist-consensus-url=postgres://root@postgres-metadata:26257?options=--search_path=consensus
+    - --orchestrator-process-tcp-proxy-listen-addr=0.0.0.0
+    - --orchestrator-process-prometheus-service-discovery-directory=/mzdata/prometheus
+    ports:
+    - 6875
+    - 6876
+    - 6877
+    - 6878
+    - 6880
+    - 6881
+    - 26257
+    environment:
+    - MZ_NO_TELEMETRY=1
+    - MZ_NO_BUILTIN_CONSOLE=1
+    - MZ_EAT_MY_DATA=1
+    - MZ_TEST_ONLY_DUMMY_SEGMENT_CLIENT=true
+    - MZ_SOFT_ASSERTIONS=1
+    - MZ_ORCHESTRATOR_PROCESS_TCP_PROXY_LISTEN_ADDR=0.0.0.0
+    - MZ_ORCHESTRATOR_PROCESS_PROMETHEUS_SERVICE_DISCOVERY_DIRECTORY=/mzdata/prometheus
+    - MZ_BOOTSTRAP_ROLE=materialize
+    - MZ_INTERNAL_PERSIST_PUBSUB_LISTEN_ADDR=0.0.0.0:6879
+    - MZ_PERSIST_PUBSUB_URL=http://127.0.0.1:6879
+    - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection
+    - MZ_EXTERNAL_LOGIN_PASSWORD_MZ_SYSTEM=password
+    - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345
+    - MZ_CATALOG_STORE=persist
+    - MZ_LOG_FILTER
+    - CLUSTERD_LOG_FILTER
+    - 'MZ_CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      2, "workers": 4}, "scale=1,workers=1,legacy": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      false, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=2,legacy":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": false, "memory_limit": "4 GiB", "scale":
+      1, "workers": 2}, "free": {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour":
+      "1", "disabled": true, "disk_limit": null, "is_cc": true, "memory_limit": "4
+      GiB", "scale": 1, "workers": 1}, "scale=1,workers=1": {"cpu_exclusive": false,
+      "cpu_limit": null, "credits_per_hour": "1", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=1,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=1,mem=32GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "32 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=1GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "1 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=2": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 2}, "scale=1,workers=2,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 2}, "scale=1,workers=2,mem=32GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "32 GiB", "scale": 1, "workers": 2}, "scale=2,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      2, "workers": 1}, "scale=2,workers=2": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 2, "workers": 2}, "scale=1,workers=2,mem=2GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "2 GiB", "scale":
+      1, "workers": 2}, "scale=1,workers=4": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 4}, "scale=1,workers=4,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 4}, "scale=1,workers=4,mem=32GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "32 GiB", "scale": 1, "workers": 4}, "scale=4,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      4, "workers": 1}, "scale=4,workers=4": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 4, "workers": 4}, "scale=1,workers=8":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 8}, "scale=1,workers=8,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 8}, "scale=1,workers=8,mem=16GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "16 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 8}, "scale=8,workers=1": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 8, "workers": 1}, "scale=8,workers=8":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "64", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      8, "workers": 8}, "scale=1,workers=16": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 16}, "scale=1,workers=16,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 16}, "scale=1,workers=16,mem=32GiB": {"cpu_exclusive": false,
+      "cpu_limit": null, "credits_per_hour": "16", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "32 GiB", "scale": 1, "workers": 16}, "scale=16,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      16, "workers": 1}, "scale=16,workers=16": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "256", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 16, "workers": 16}, "scale=1,workers=32":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 32}, "scale=1,workers=32,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 32}, "scale=1,workers=32,mem=16GiB": {"cpu_exclusive": false,
+      "cpu_limit": null, "credits_per_hour": "32", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "16 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 32}, "scale=32,workers=1": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 32, "workers": 1}, "scale=32,workers=32":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1024", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      32, "workers": 32}}'
+    - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_SUPPORT_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_CATALOG_SERVER_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_ANALYTICS_CLUSTER_REPLICA_SIZE=bootstrap
+    - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICATION_FACTOR=1
+    - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICATION_FACTOR=1
+    - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1
+    - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s
+    - COCKROACH_LOG_MAX_SYNC_DURATION=120s
+    - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurre
ncy=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluste
r_replication_factor=1
+    - MZ_NO_EXTERNAL_CLUSTERD=1
+    - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle
+    - MZ_NO_BUILTIN_POSTGRES=1
+    - MZ_NO_BUILTIN_COCKROACH=1
+    - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter
+    - MZ_LISTENERS_CONFIG_PATH=/listeners_config
+    volumes:
+    - ../../src/materialized/ci/listener_configs/testdrive.json:/listeners_config
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    tmpfs:
+    - /tmp
+    healthcheck:
+      test:
+      - CMD
+      - curl
+      - -f
+      - localhost:6878/api/readyz
+      interval: 1s
+      start_period: 600s
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: materialize/materialized:latest
+  workload:
+    depends_on:
+      materialized:
+        condition: service_healthy
+      redpanda:
+        condition: service_healthy
+    environment:
+    - PGHOST=materialized
+    - PGPORT=6875
+    - PGUSER=materialize
+    - KAFKA_BROKER=kafka:9092
+    - SCHEMA_REGISTRY_URL=http://schema-registry:8081
+    platform: linux/amd64
+    image: materialize-workload:latest
+networks: {}
+volumes:
+  mzdata: null
+  pgdata: null
+  mysqldata: null
+  mssqldata: null
+  sourcedata_512Mb:
+    driver_opts:
+      device: tmpfs
+      type: tmpfs
+      o: size=512m
+  mydata: null
+  tmp: null
+  secrets: null
+  scratch: null
diff --git a/antithesis/scratchbook/bug-candidates.md b/antithesis/scratchbook/bug-candidates.md
new file mode 100644
index 0000000000000..f90d8b377a706
--- /dev/null
+++ b/antithesis/scratchbook/bug-candidates.md
@@ -0,0 +1,161 @@
+# Bug Candidates for Antithesis Reproduction
+
+Bugs found by mining the Materialize git history for timing/concurrency fixes
+that Antithesis's deterministic scheduling would reliably find.
+
+## 1. Persist Lease Race (Best Candidate)
+
+**Commit**: `43f024da36` — "persist: Make sure to obtain a lease before selecting a batch"
+**PR**: #35554
+**Severity**: Production incident — read-time halt
+**Category**: TOCTOU race
+
+### The Bug
+
+Persist uses "seqno leases" to prevent GC from deleting batches a reader is
+still processing. Before the fix, readers selected a batch *then* obtained a
+lease. GC could delete the batch in between:
+
+```
+Reader                              GC
+──────                              ──
+1. snapshot() at SeqNo 5
+   → picks BatchA (blob: part-0001)
+                                    2. Compaction merges BatchA away → SeqNo 6
+                                    3. seqno_since advances (no lease on 5)
+                                    4. Deletes part-0001 from blob storage
+5. lease_seqno() → SeqNo 7 (too late)
+6. fetch(BatchA) → 404 → HALT
+```
+
+The fix reorders to: lease first, then select batch. The lease prevents GC
+from advancing past the leased SeqNo.
+
+### Code Paths Affected
+
+- `Listen::next` (read.rs:287) — continuous feed that hydrates MVs. Runs in
+  the background for every materialized view with an active source. This is the
+  most natural trigger — always active, exercises the lease path on every new
+  batch.
+- `snapshot_cursor` (read.rs:1176) — used by "persist peeks" (SELECT on
+  unindexed tables). Less common than the listen path.
+- `snapshot_and_fetch` (read.rs:889) — used by catalog ops and txn-WAL reads.
+
+All three now go through `snapshot_batches()` (read.rs:846), which does
+lease-then-snapshot.
+
+### Workload to Trigger
+
+Simple mixed read/write traffic exercises the listen path:
+- Continuous INSERTs into a table (creates new batches → SeqNo churn → GC pressure)
+- A materialized view over that table (its listen is always running)
+- Concurrent SELECTs on the MV (served from in-memory arrangements, but
+  the listen feeding the MV is the actual race target)
+
+Compaction and GC run automatically in the background. Antithesis's scheduler
+can interleave GC between batch selection and lease acquisition.
+
+### Properties
+
+- `persist-cas-monotonicity` — batch data should never disappear
+- `critical-reader-fence-linearization` — leases should protect batches
+- Workload-side: reads never hang or error unexpectedly
+- SUT-side: the panic at read.rs:864 fires if a batch is missing after the
+  upper advanced (added by the fix — would need to be preserved in a
+  revert-and-detect test)
+
+### Testing Notes
+
+A pure `git revert` of `43f024da36` removes both the fix AND the panic that
+detects the impossible state. To validate, surgically revert only the ordering
+(put lease back after snapshot) while keeping the panic or replacing it with
+`assert_unreachable!`.
+
+---
+
+## 2. Compute Dependency Frontier Race
+
+**Commit**: `42a22b7ff5` — "compute: fix a race condition in collecting dependency frontiers"
+**Severity**: Compute controller panic
+**Category**: TOCTOU — check-then-act across async boundary
+
+### The Bug
+
+The compute controller checked whether storage collections existed, then
+collected their frontiers in a second step. Collections could be dropped
+between the two steps:
+
+```
+Step 1: check_exists(collection_id) → true
+                                          storage drops collection_id
+Step 2: collections_frontiers([collection_id]) → panic! missing key
+```
+
+Fix: replaced the two-step check-then-read with a single
+`collection_frontiers(id).ok()` that handles missing collections atomically.
+
+**File**: `src/compute-client/src/controller/instance.rs`
+
+### Workload to Trigger
+
+Rapid concurrent DDL — CREATE/DROP of sources and MVs while the compute
+controller is resolving dependency frontiers.
+
+### Properties
+
+- `compute-replica-epoch-isolation`
+- System should never panic from DDL operations
+
+---
+
+## 3. Reclock Upper Race with as_of
+
+**Commit**: `e3805ad790` — "Fetch latest upper in reclock to avoid races with as_of"
+**Severity**: Panic (fixes database-issues#8698)
+**Category**: Stale cached value in timing-sensitive decision
+
+A cached `upper` became stale between caching and `as_of` calculation, causing
+a panic when `as_of > upper`.
+
+### Properties
+
+- `strict-serializable-reads`
+
+---
+
+## 4. MV-Sink Discarding Valid Batch Descriptions
+
+**Commit**: `0886c94dc2` — "mv-sink: stop discarding valid batch descriptions"
+**Severity**: Silent data loss
+**Category**: Stale frontier view
+
+Incorrect persist frontier view caused valid batch descriptions to be rejected
+as "outdated." No crash, no error — just silently dropped data.
+
+### Properties
+
+- `mv-reflects-source-updates`
+
+---
+
+## 5. Introspection Collection Frontier Regression
+
+**Commit**: `ec4f8996bb` — "compute: avoid frontier regressions for introspection collections"
+**Severity**: Frontier monotonicity violation
+**Category**: Initialization ordering mismatch
+
+### Properties
+
+- `persist-cas-monotonicity`
+
+---
+
+## 6. as_of Selection Upper Constraint Bugs
+
+**Commit**: `e6ca4801fa` — "as_of_selection: fix two bugs around upper constraints"
+**Severity**: 0dt upgrade availability blocked
+**Category**: Incorrect boundary calculation
+
+### Properties
+
+- `deployment-promotion-safety`
diff --git a/antithesis/scratchbook/deployment-topology.md b/antithesis/scratchbook/deployment-topology.md
new file mode 100644
index 0000000000000..b03f0aa469449
--- /dev/null
+++ b/antithesis/scratchbook/deployment-topology.md
@@ -0,0 +1,157 @@
+# Deployment Topology: Materialize
+
+## Approach: mzcompose-Generated Docker Compose
+
+The most straightforward path is to use Materialize's **mzcompose** framework to generate the Docker Compose configuration for Antithesis. mzcompose already defines all the service classes, health checks, environment variables, and dependencies needed to run a complete Materialize test environment.
+
+**Strategy**: Write an `mzcompose.py` file that defines the Antithesis test topology, use mzcompose to generate the Docker Compose YAML, then adapt it for Antithesis (adding test template mounts).
+
+## Topology Overview
+
+```
++---------------------+      +---------------------+
+| workload-client     | ---> | materialized        |
+| (test driver,       | <--- | (environmentd +     |
+|  Antithesis SDK,    |      |  embedded clusterd) |
+|  test templates)    |      |                     |
++---------------------+      +---------+-----------+
+                                       |
+                    +------------------+------------------+
+                    |                  |                  |
+                    v                  v                  v
+          +----------------+  +----------------+  +----------------+
+          | postgres-      |  | minio          |  | redpanda       |
+          | metadata       |  | (blob storage) |  | (Kafka-compat) |
+          | (consensus)    |  |                |  |                |
+          +----------------+  +----------------+  +----------------+
+```
+
+## Container Specifications
+
+### 1. postgres-metadata (Dependency)
+
+| | |
+|---|---|
+| **Role** | Metadata store / consensus for persist and catalog |
+| **Image** | `postgres:16` (or mzcompose's `PostgresMetadata` service) |
+| **Why** | Default metadata store in modern mzcompose. Lighter than CockroachDB. Sufficient for single-node testing. |
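+| **Ports** | 26257 (the port `materialized` connects to, e.g. `postgres://root@postgres-metadata:26257`; a stock `postgres:16` image would listen on 5432 instead) |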
+| **Health check** | `pg_isready -U postgres` |
+| **Network connections** | materialized reads/writes catalog and persist consensus |
+| **Replicas** | 1 |
+
+PostgreSQL is the default metadata store in modern Materialize testing (`EXTERNAL_METADATA_STORE=postgres-metadata`). CockroachDB is an alternative but adds complexity and state space without benefit for single-coordinator testing.
+
+### 2. minio (Dependency)
+
+| | |
+|---|---|
+| **Role** | S3-compatible blob storage for persist data |
+| **Image** | `minio/minio` (or mzcompose's `Minio` with `setup_materialize=True`) |
+| **Why** | Persist stores all durable data (source data, MV data, catalog snapshots) in blob storage. MinIO is the standard test substitute for S3. |
+| **Ports** | 9000 (S3 API), 9001 (console) |
+| **Health check** | `curl --fail http://localhost:9000/minio/health/live` |
+| **Network connections** | materialized writes/reads persist blobs |
+| **Replicas** | 1 |
+| **Config** | Pre-create `/data/persist` bucket. `MINIO_STORAGE_CLASS_STANDARD=EC:0` |
+
+### 3. redpanda (Dependency)
+
+| | |
+|---|---|
+| **Role** | Kafka-compatible message broker for stream source ingestion |
+| **Image** | `redpandadata/redpanda` (or mzcompose's `Redpanda` service) |
+| **Why** | Enables testing the Kafka source ingestion path, which is the most common production use case. Redpanda is lighter than Kafka+Zookeeper and includes a built-in Schema Registry. |
+| **Ports** | 9092 (Kafka API), 8081 (Schema Registry) |
+| **Health check** | `rpk cluster health` |
+| **Network connections** | materialized reads source data; workload-client may produce test data |
+| **Replicas** | 1 |
+
+### 4. materialized (Service — SUT)
+
+| | |
+|---|---|
+| **Role** | The system under test. Runs environmentd (coordinator) with embedded clusterd (compute/storage workers). |
+| **Image** | `materialized` (mzcompose's `Materialized` service, built via `mzbuild`) |
+| **Why** | This is the core SUT. The embedded clusterd mode runs everything in one process, simplifying the topology while still exercising all three layers (adapter, compute, storage). |
+| **Ports** | 6875 (pgwire), 6876-6878 (API/admin), 6879 (persist pubsub), 26257 (pg-compat) |
+| **Health check** | `curl -f localhost:6878/api/readyz` (interval 1s, start_period 600s) |
+| **Network connections** | postgres-metadata (consensus), minio (blob), redpanda (sources) |
+| **Replicas** | 1 |
+| **Key environment** | `MZ_NO_TELEMETRY=1`, `MZ_SOFT_ASSERTIONS=1`, `MZ_CATALOG_STORE=persist`, `MZ_BOOTSTRAP_ROLE=materialize`, `MZ_UNSAFE_MODE=1` |
+| **Key command args** | `--unsafe-mode`, `--persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/&region=minio`, `--environment-id=...` |
+| **Depends on** | postgres-metadata, minio |
+
+**Design decision**: Use embedded clusterd (single process) rather than separate clusterd containers. This reduces state space while still exercising all code paths. Separate clusterd testing can be added as a second topology later.
+
+### 5. workload-client (Client — Test Driver)
+
+| | |
+|---|---|
+| **Role** | Runs Antithesis test commands. Emits `setup_complete`. Contains test templates. |
+| **Image** | Custom image built on top of testdrive or a Python-based client |
+| **Why** | Exercises the system via SQL (pgwire), produces Kafka messages, and asserts properties via the Antithesis SDK. |
+| **Ports** | None exposed |
+| **Network connections** | materialized (pgwire:6875), redpanda (Kafka:9092, SR:8081) |
+| **Replicas** | 1 |
+| **Test template mount** | `/opt/antithesis/test/v1/materialize/` |
+
+The workload client needs:
+1. PostgreSQL client library (psycopg2 or psql) to issue SQL
+2. Kafka producer library to push test data
+3. Antithesis Python SDK for assertions and lifecycle signals
+4. Test command scripts with appropriate prefixes (`first_`, `parallel_driver_`, `anytime_`, `eventually_`, `finally_`)
+
+## SDK Selection
+
+| Component | Language | SDK Needed |
+|-----------|----------|------------|
+| workload-client | Python | `antithesis` Python package (the Antithesis Python SDK) — for assertions, lifecycle signals |
+| materialized (optional, future) | Rust | `antithesis_sdk` Rust crate — for SUT-side reachability/safety assertions |
+
+The workload client **must** have the SDK for emitting assertions. SUT-side Rust SDK instrumentation is optional but recommended for deeper coverage of internal invariants (persist CaS correctness, frontier monotonicity, catalog consistency).
+
+## mzcompose Integration Path
+
+### Option A: Static Docker Compose (Recommended for v1)
+
+1. Write an `mzcompose.py` that defines the topology above
+2. Run `mzcompose --find antithesis gen-docker-compose` (or equivalent) to emit YAML
+3. Add any Antithesis-specific adaptations as needed
+4. Place the resulting `docker-compose.yml` in `guest/opt/materialize/`
+
+### Option B: Dynamic mzcompose (Future)
+
+1. Package the entire mzcompose framework into the workload-client image
+2. Use a `first_` test command to generate and start the compose topology
+3. More flexible but more complex; requires mzcompose to work inside Antithesis
+
+Option A is the pragmatic choice. It generates a compose file that Antithesis can directly manage.
+
+## Workload Design (High Level)
+
+Test commands in `/opt/antithesis/test/v1/materialize/`:
+
+| Command | Type | Purpose |
+|---------|------|---------|
+| `first_setup.sh` | first_ | Create sources, materialized views, tables. Establish baseline state. |
+| `parallel_driver_sql_workload.py` | parallel_driver_ | Continuously run SQL operations: INSERTs, SELECTs, CREATE/DROP views. Assert consistency properties. |
+| `parallel_driver_kafka_producer.py` | parallel_driver_ | Produce messages to Kafka topics. Verify they appear in materialized views. |
+| `eventually_consistency_check.py` | eventually_ | Verify that all acknowledged writes are visible in materialized views. |
+| `finally_invariant_check.py` | finally_ | Final consistency sweep: compare source data with MV contents. |
+| `anytime_health_check.sh` | anytime_ | Verify system health endpoint and basic SQL connectivity. |
+
+## Assumptions
+
+- Embedded clusterd (single process) is sufficient for initial testing
+- PostgreSQL is the preferred metadata store (simpler than CockroachDB)
+- Redpanda is preferred over Kafka+Zookeeper (lighter, built-in schema registry)
+- The workload client will be Python-based (leveraging existing testdrive patterns)
+- Static Docker Compose generation (Option A) is the right starting point
+
+## Open Questions
+
+- Should we also test with external clusterd processes (separate compute replicas)?
+- Should materialized be subject to fault injection, or only the network between it and dependencies?
+- What is the best base image for the workload client — extend the existing testdrive image or build from scratch?
+- Should the workload client use testdrive's `.td` format or raw SQL via psycopg?
diff --git a/antithesis/scratchbook/existing-assertions.md b/antithesis/scratchbook/existing-assertions.md
new file mode 100644
index 0000000000000..8e423c26a0415
--- /dev/null
+++ b/antithesis/scratchbook/existing-assertions.md
@@ -0,0 +1,37 @@
+# Existing Antithesis SDK Assertions
+
+## Summary
+
+**No Antithesis SDK assertions exist in the Materialize source code.**
+
+A comprehensive search of the Materialize repository (the Rust sources under `materialize/src/` and the Python test code) found:
+
+- No `use antithesis` import statements
+- No Cargo.toml dependencies on any antithesis crate
+- No assertion macros: `assert_always!`, `assert_sometimes!`, `assert_reachable!`, `assert_unreachable!`
+- No antithesis function calls in the Python test code within the materialize repository
+
+## Existing Antithesis Integration (Customer Level)
+
+Antithesis integration exists at the **customer-repo level** (outside the materialize source), using the legacy experiment-script approach:
+
+### Experiment Scripts (`guest/opt/antithesis/experiment/`)
+
+- **`materialize.py`**: Docker Compose-based experiment. Uses `antithesis.start_customer_containers()`, `antithesis.start_fault_injector()`, `antithesis.run_process()`, `antithesis.fuzz_msg()`, `antithesis.end_test()`. Orchestrates testdrive workloads with network chaos (latency, packet loss, partitions).
+- **`testdrive.py`**: K8s-based variant. Sets up k3s cluster with minio, redpanda, postgres, environmentd. Runs testdrive via kubectl.
+- **`materialize-k8s.sh`**: Bash setup for K8s resources.
+
+### Docker Compose Topology (`guest/opt/materialize/docker-compose.yml`)
+
+Uses custom Antithesis-instrumented images:
+- `antithesis-cp-combined` (Kafka + Schema Registry)
+- `antithesis-materialized` (Materialize)
+- `antithesis-testdrive` (Test workload)
+
+### K8s Manifests (`guest/opt/materialize/k8s/antithesis/`)
+
+Full Kubernetes topology: environmentd StatefulSet, postgres StatefulSet, redpanda Deployment, testdrive Pod, with PVs and services.
+
+## Implications for New Work
+
+All property assertions will need to be added fresh. The existing integration provides a starting point for topology but uses an older approach (experiment scripts, custom instrumented images). The new approach should leverage mzcompose for compose generation and add Antithesis SDK assertions either in the workload client or (for deeper coverage) in the Materialize Rust source.
diff --git a/antithesis/scratchbook/properties/catalog-recovery-consistency.md b/antithesis/scratchbook/properties/catalog-recovery-consistency.md
new file mode 100644
index 0000000000000..8b581a99adf60
--- /dev/null
+++ b/antithesis/scratchbook/properties/catalog-recovery-consistency.md
@@ -0,0 +1,33 @@
+# catalog-recovery-consistency
+
+## Summary
+After coordinator crash and restart, the catalog state is consistent: upper never decreases, snapshot is consolidated, all committed transactions visible.
+
+## Evidence
+
+### Code Paths
+- `src/catalog/src/durable/persist.rs:536-539` — `sync_to_current_upper`
+- `src/catalog/src/durable/persist.rs:575-577` — ListenEvent::Progress antichain logic
+- `src/catalog/src/durable/persist.rs:706-724` — `consolidate` method
+- `src/catalog/src/durable/persist.rs:593-612` — sync applies updates by timestamp, consolidates after each
+- `src/catalog/src/durable/persist.rs:1092` — Assertion on snapshot consolidation
+- `src/catalog/src/durable/persist.rs:1167-1170` — Fence token generation syncs to upper
+
+### How It Works
+On startup, the coordinator reads the persist shard from the latest rollup + incremental diffs, reconstructing the full catalog state. `sync_to_current_upper()` applies all updates up to the current upper antichain and consolidates the snapshot. The existing code has a debug assertion at line 1092 checking consolidation.
+
+### What Goes Wrong on Violation
+- Upper regression: coordinator sees older schema state than what was committed, losing recent DDL
+- Unconsolidated snapshot: duplicate entries cause incorrect catalog lookups, potential panics
+- Missing transactions: committed DDL not visible after restart, users lose tables/views
+
+### Key Subtlety
+Crash during `maybe_consolidate()` (lines 596, 610) could leave the snapshot in an intermediate state. On restart, the next sync must handle this gracefully by reconsolidating from the durable upper.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions (debug_assert exists at line 1092 but only in debug builds)
+- Candidate: After `sync_to_current_upper()`, add `assert_always!` that upper >= previous upper
+- Candidate: After consolidation, add `assert_always!` that no duplicate (kind, key) entries exist
+
+### Provenance
+Surfaced by Failure Recovery focus (merged from catalog-upper-monotonicity and catalog-snapshot-consolidation).
diff --git a/antithesis/scratchbook/properties/command-channel-ordering.md b/antithesis/scratchbook/properties/command-channel-ordering.md
new file mode 100644
index 0000000000000..0f47965189999
--- /dev/null
+++ b/antithesis/scratchbook/properties/command-channel-ordering.md
@@ -0,0 +1,28 @@
+# command-channel-ordering
+
+## Summary
+Timely workers must see CreateDataflow commands in identical order — code explicitly acknowledges this is not guaranteed by Timely.
+
+## Evidence
+
+### Code Paths
+- `src/compute/src/command_channel.rs:88-90` — Comment: "relies on Timely channels preserving order of inputs, which is not something they guarantee"
+- `src/compute/src/command_channel.rs:96-100` — Source operator activation sequence
+- `src/compute/src/command_channel.rs:41-58` — Sender using `Arc<Mutex>` activator
+
+### How It Works
+The command channel broadcasts commands from worker 0 to all other Timely workers via a Timely dataflow operator. Commands are fed in order, but the code explicitly notes that Timely does not guarantee preservation of input ordering.
+
+### What Goes Wrong on Violation
+Workers execute dataflows in different orders, causing divergent state. Since all workers must agree on dataflow state for correct results, reordering leads to inconsistent query results or panics during distributed computation.
+
+### Why This Is an Antithesis Target
+This is the kind of bug that almost never manifests in normal testing because thread scheduling is usually consistent. Antithesis's deterministic scheduling exploration can systematically vary worker activation timing to expose reordering.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: On each worker, log the command sequence and add `assert_always!` that worker N's command sequence matches worker 0's
+- This is a strong candidate for SUT-side instrumentation since the invariant is internal to the compute engine
+
+### Provenance
+Surfaced by Concurrency focus.
diff --git a/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md b/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
new file mode 100644
index 0000000000000..019445cc28632
--- /dev/null
+++ b/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
@@ -0,0 +1,25 @@
+# compute-replica-epoch-isolation
+
+## Summary
+Compute replica incarnations are isolated by epoch — commands from old epochs cannot execute after a new epoch starts.
+
+## Evidence
+
+### Code Paths
+- `src/compute-client/src/controller/replica.rs:70-107` — Epoch at line 93, ReplicaTask at line 146
+- `src/compute-client/src/protocol/command.rs:45-54` — Hello command with nonce for protocol iteration
+- `src/compute-client/src/controller/replica.rs:142-144` — Task abortion on rehydration clears old commands
+
+### How It Works
+Each replica incarnation gets a unique epoch (nonce + u64). On rehydration, the controller aborts the old ReplicaTask and creates a new one with an incremented epoch. The Hello command includes the new nonce, and the replica rejects commands with mismatched nonces.
+
+### What Goes Wrong on Violation
+Stale commands from a previous incarnation execute on the new replica, causing it to diverge from the coordinator's expected state. Query results become inconsistent across replicas.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: On command receipt, add `assert_always!(command.epoch >= current_epoch)` in the replica's command handler
+- Candidate: After rehydration, add `assert_reachable!` that the new epoch is used for the first command
+
+### Provenance
+Surfaced by Distributed Coordination focus.
diff --git a/antithesis/scratchbook/properties/critical-reader-fence-linearization.md b/antithesis/scratchbook/properties/critical-reader-fence-linearization.md
new file mode 100644
index 0000000000000..5da820a7d464c
--- /dev/null
+++ b/antithesis/scratchbook/properties/critical-reader-fence-linearization.md
@@ -0,0 +1,24 @@
+# critical-reader-fence-linearization
+
+## Summary
+Critical reader opaque token comparison linearizes correctly — concurrent readers cannot bypass the fencing mechanism.
+
+## Evidence
+
+### Code Paths
+- `src/persist-client/src/internal/state.rs:1937-1979` — `compare_and_downgrade_since()` with opaque fencing
+- `src/persist-client/src/critical.rs` — `CriticalReaderId` and `Opaque` definitions
+
+### How It Works
+Critical readers hold a `since` frontier that prevents GC of data at held timestamps. The `compare_and_downgrade_since` operation uses an opaque token to fence: the caller provides `expected_opaque`, and if it doesn't match the current opaque in state, the operation fails (but still commits a SeqNo increment to prevent ABA). Only the caller with the correct opaque can advance the since.
+
+### What Goes Wrong on Violation
+If fencing is bypassed, two readers could both think they hold the since, leading to premature GC. Data needed by active readers is deleted, causing read failures or panics.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: After successful downgrade, add `assert_always!(state.opaque == my_opaque)` to confirm fencing
+- Candidate: On mismatch, add `assert_always!(seqno_advanced)` to confirm ABA prevention
+
+### Provenance
+Surfaced by Data Integrity focus.
diff --git a/antithesis/scratchbook/properties/deployment-lag-detection.md b/antithesis/scratchbook/properties/deployment-lag-detection.md
new file mode 100644
index 0000000000000..213c3dd2f904b
--- /dev/null
+++ b/antithesis/scratchbook/properties/deployment-lag-detection.md
@@ -0,0 +1,26 @@
+# deployment-lag-detection
+
+## Summary
+0DT caught-up check eventually detects lagging or crash-looping replicas and blocks promotion.
+
+## Evidence
+
+### Code Paths
+- `src/adapter/src/coord/caught_up.rs:53-150` — `maybe_check_caught_up` with replica frontier snapshot
+- `src/adapter/src/coord/caught_up.rs:127-136` — Lag comparison against allowed threshold
+- `src/adapter/src/coord/caught_up.rs:145-149` — `problematic_replicas` detection
+- Dynamic configs: `WITH_0DT_CAUGHT_UP_CHECK_ALLOWED_LAG`, `ENABLE_0DT_CAUGHT_UP_REPLICA_STATUS_CHECK`
+
+### How It Works
+Periodically during catchup, the coordinator queries `MZ_CLUSTER_REPLICA_FRONTIERS` and compares each replica's frontier against the expected threshold. If any replica's frontier lags beyond `allowed_lag`, promotion is blocked. Additionally, `analyze_replica_looping()` checks `mz_cluster_replica_status_history` for crash patterns.
+
+### What Goes Wrong on Violation
+If a stuck/crashing replica is not detected, promotion proceeds with an unhealthy replica. Post-promotion, queries routed to that replica fail or return stale results.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: Add `assert_sometimes!(lagging_replica_blocked_promotion)` to confirm the detection path is exercised
+- This is a liveness property — we want to confirm the system can detect the problem, not just that it doesn't happen
+
+### Provenance
+Surfaced by Lifecycle focus.
diff --git a/antithesis/scratchbook/properties/deployment-promotion-safety.md b/antithesis/scratchbook/properties/deployment-promotion-safety.md
new file mode 100644
index 0000000000000..e6794631a0aec
--- /dev/null
+++ b/antithesis/scratchbook/properties/deployment-promotion-safety.md
@@ -0,0 +1,26 @@
+# deployment-promotion-safety
+
+## Summary
+0DT deployment promotion happens only after all replicas have caught up to required frontiers.
+
+## Evidence
+
+### Code Paths
+- `src/environmentd/src/deployment/state.rs:92-108` — `set_ready_to_promote` transitions Initializing->CatchingUp->ReadyToPromote
+- `src/environmentd/src/deployment/preflight.rs:57-120` — `preflight_0dt` with `caught_up_max_wait` and `caught_up_trigger`
+- `src/adapter/src/coord/caught_up.rs:53-150` — Replica frontier checks via `MZ_CLUSTER_REPLICA_FRONTIERS`
+- `src/catalog/src/durable/error.rs:115-124` — `FenceError::DeployGeneration`
+
+### How It Works
+During 0DT deployment, the new coordinator boots in read-only mode. It runs preflight checks including `maybe_check_caught_up()` which compares replica frontiers against a cutoff threshold. Only after all replicas pass the check does the coordinator transition to ReadyToPromote. On promotion, the deployment generation is incremented, fencing out the old coordinator.
+
+### What Goes Wrong on Violation
+Premature promotion causes the new coordinator to serve queries while replicas are still rehydrating from storage. Users see stale data or timeouts. In the worst case, the old coordinator continues writing with a lower generation, causing split-brain.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: At promotion time, add `assert_always!` that all tracked replica frontiers >= cutoff
+- Candidate: Add `assert_reachable!("0dt_promotion_completed")` to confirm the promotion path is exercised
+
+### Provenance
+Surfaced by Lifecycle and Distributed Coordination focuses.
diff --git a/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md b/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
new file mode 100644
index 0000000000000..3fb5167f9edf7
--- /dev/null
+++ b/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
@@ -0,0 +1,35 @@
+# epoch-fencing-prevents-split-brain
+
+## Summary
+Epoch-based leader fencing prevents two coordinators from concurrently writing to the catalog persist shard.
+
+## Evidence
+
+### Code Paths
+- `src/catalog/src/durable/persist.rs:149-169` — `FenceableToken::validate()` and `maybe_fence()` check epoch on every write
+- `src/catalog/src/durable/persist.rs:393-461` — `compare_and_append` with fence validation before consensus write
+- `src/catalog/src/durable/error.rs:114-131` — `FenceError` enum: `DeployGeneration` and `Epoch` variants
+- `src/catalog/src/durable/persist.rs:1166-1192` — Fence token generation during `open_inner`
+- `src/environmentd/src/deployment/state.rs:24-123` — Deployment state machine transitions
+
+### How It Works
+On startup, the coordinator reads the current fence token from consensus and increments the epoch. The new token is written via CaS. All subsequent writes include the token; if consensus contains a higher epoch, the write fails with `FenceError::Epoch`.
+
+### What Goes Wrong on Violation
+Two coordinators with the same epoch could both write catalog mutations, leading to divergent schema state. Users would see inconsistent table definitions, lost DDL operations, or catalog corruption requiring manual intervention.
+
+### Failure Scenario
+1. Coordinator A is running with epoch 10
+2. Coordinator A becomes partitioned from consensus
+3. Coordinator B starts, reads epoch 10, increments to epoch 11
+4. Partition heals; A attempts to write with epoch 10
+5. **Expected**: A's write fails with FenceError
+6. **Bug**: A's CaS succeeds despite its lower epoch (e.g. due to a race in validation), so both coordinators write to the catalog
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions in codebase
+- Candidate instrumentation point: `FenceableToken::validate()` — add `assert_always!` that validates token comparison result matches expected fencing behavior
+- Candidate: `compare_and_append` success path — add `assert_always!` that write_epoch >= current_epoch (a write carrying an epoch older than the one in consensus must never succeed)
+
+### Provenance
+Surfaced independently by Distributed Coordination and Failure Recovery focuses.
diff --git a/antithesis/scratchbook/properties/fault-recovery-exercised.md b/antithesis/scratchbook/properties/fault-recovery-exercised.md
new file mode 100644
index 0000000000000..d6499991da5a6
--- /dev/null
+++ b/antithesis/scratchbook/properties/fault-recovery-exercised.md
@@ -0,0 +1,28 @@
+# fault-recovery-exercised
+
+## Summary
+After coordinator crash, the system eventually recovers and serves queries.
+
+## Evidence
+
+### Code Paths
+- `src/environmentd/src/environmentd/main.rs` — Main startup, catalog recovery
+- `src/environmentd/src/http/probe.rs` — `/api/readyz` readiness endpoint (the one polled by the compose healthcheck)
+- `src/catalog/src/durable/persist.rs:1166-1192` — `open_inner` recovery path
+
+### How It Works
+On restart, environmentd re-reads the catalog from persist, increments the epoch, rehydrates compute/storage clusters, and starts accepting connections. The readiness probe (`/api/readyz`) returns 200 only after the adapter is fully initialized.
+
+### What Goes Wrong on Violation
+The system fails to recover: it crashes on startup due to corrupt catalog state, enters an infinite restart loop, or becomes ready but cannot serve queries due to incomplete rehydration.
+
+### Why This Is a Property
+This is the most fundamental liveness property. It doesn't test a specific invariant — it tests that the entire recovery pipeline works end-to-end under adversarial crash timing.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Best verified at workload level: crash environmentd, wait for readiness, issue SELECT query, assert success
+- Candidate: Add `assert_sometimes!(recovery_completed_successfully)` after catalog recovery succeeds
+
+### Provenance
+Surfaced by Failure Recovery focus.
diff --git a/antithesis/scratchbook/properties/group-commit-toctou-safety.md b/antithesis/scratchbook/properties/group-commit-toctou-safety.md
new file mode 100644
index 0000000000000..bae54fcc085cc
--- /dev/null
+++ b/antithesis/scratchbook/properties/group-commit-toctou-safety.md
@@ -0,0 +1,28 @@
+# group-commit-toctou-safety
+
+## Summary
+No phantom writes to tables deleted between write deferral and group_commit execution.
+
+## Evidence
+
+### Code Paths
+- `src/adapter/src/coord/appends.rs:479-486` — Explicit TOCTOU check: "If the table... has been deleted while the write was deferred"
+- `src/adapter/src/coord/appends.rs:214-216` — `defer_op` enqueue point
+- `src/adapter/src/coord/appends.rs:394-399` — JIT lock acquisition in group_commit
+
+### How It Works
+When a write arrives and cannot immediately acquire the write lock, it is deferred. Later, group_commit processes deferred writes. Before applying each write, it checks `catalog().try_get_entry(table_id)`. If the table was dropped between deferral and execution, the write is silently dropped.
+
+### What Goes Wrong on Violation
+Writes land in a shard for a table that no longer exists in the catalog. This causes inconsistency between the catalog (table doesn't exist) and persist (shard has data). Downstream queries may panic or return garbage.
+
+### The TOCTOU Window
+The explicit comment at appends.rs:479 acknowledges the race. The window is between line 214 (write enqueued) and line 484 (catalog check during group_commit). Concurrent DDL (DROP TABLE) within this window is the trigger.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: After group_commit drops a deferred write, add `assert_reachable!("group_commit_dropped_deferred_write_to_deleted_table")` to confirm this path is exercised
+- Candidate: After group_commit succeeds, add `assert_always!` that all written table_ids still exist in catalog
+
+### Provenance
+Surfaced by Concurrency focus.
diff --git a/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md b/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
new file mode 100644
index 0000000000000..0837770823d9a
--- /dev/null
+++ b/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
@@ -0,0 +1,28 @@
+# idempotent-write-under-indeterminate
+
+## Summary
+Compare-and-append retries with the same idempotency token produce exactly one committed write — never duplicated, never lost.
+
+## Evidence
+
+### Code Paths
+- `src/persist-client/src/internal/machine.rs:387-468` — Detailed comments on Indeterminate error handling and retry-with-idempotency-token
+- `src/persist-client/src/internal/state.rs:1687` — `compare_and_append` function
+- `src/persist-client/src/write.rs:409` — Retry wrapper with `IdempotencyToken`
+- `src/persist-client/src/internal/state.rs:1715-1724` — Writer state and lease tracking
+
+### How It Works
+Each writer holds an `IdempotencyToken`. On Indeterminate error, the retry includes the same token. The state machine checks if a write with that token already succeeded (checking writer state). If so, it returns `AlreadyCommitted`. If not, it proceeds normally.
+
+### What Goes Wrong on Violation
+Duplicate writes: the shard contains two copies of the same batch, leading to double-counting in materialized views. Or lost writes: the batch is neither committed nor retried successfully, causing data loss.
+
+### Key Subtlety
+The comments at machine.rs:387-468 describe subtle scenarios where the writer must distinguish between "my write succeeded but I didn't get the ack" vs "my write failed and I need to retry." The IdempotencyToken is the mechanism, but the window between consensus write and state observation is where bugs hide.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: After Indeterminate retry, add `assert_always!` that shard trace contains exactly one instance of the batch
+
+### Provenance
+Surfaced by Data Integrity focus.
diff --git a/antithesis/scratchbook/properties/mv-reflects-source-updates.md b/antithesis/scratchbook/properties/mv-reflects-source-updates.md
new file mode 100644
index 0000000000000..a500f32fb1b0a
--- /dev/null
+++ b/antithesis/scratchbook/properties/mv-reflects-source-updates.md
@@ -0,0 +1,32 @@
+# mv-reflects-source-updates
+
+## Summary
+Materialized views eventually reflect changes to their source data.
+
+## Evidence
+
+### Code Paths
+- `src/compute/src/render/` — Dataflow rendering for materialized views
+- `src/compute/src/server.rs` — Compute server receives commands and renders dataflows
+- `src/adapter/src/coord/sequencer/` — CREATE MATERIALIZED VIEW sequencing
+
+### How It Works
+When source data changes, differential dataflow operators in the compute layer process the deltas and update the materialized view's persist shard. The MV's frontier advances as updates are committed.
+
+### What Goes Wrong on Violation
+MVs show stale data permanently despite source updates. Users query a materialized view expecting fresh data and get results that never update. This is a failure of the core value proposition.
+
+### Why This Is an End-to-End Property
+Unlike internal properties (epoch fencing, CaS monotonicity), this property is directly observable by users. It combines source ingestion, compute processing, and persist writes into a single check.
+
+### Workload Verification
+1. INSERT INTO table1 VALUES (1, 'test')
+2. Wait for mv1 (a materialized view that SELECTs from table1) to reflect the insert
+3. SELECT * FROM mv1 — must eventually contain (1, 'test')
+
+### SUT-Side Instrumentation Notes
+- Best verified at workload level via SQL assertions
+- Candidate: Add `assert_sometimes!(mv_frontier_advanced)` in the compute persist sink
+
+### Provenance
+Surfaced by Product Context focus.
diff --git a/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md b/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
new file mode 100644
index 0000000000000..e0a3c6c682336
--- /dev/null
+++ b/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
@@ -0,0 +1,35 @@
+# peek-lifecycle-exactly-once
+
+## Summary
+Each peek command produces exactly one response — no duplicates, no leaks, no orphaned state.
+
+## Evidence
+
+### Code Paths
+- `src/adapter/src/coord/peek.rs:80-95` — Explicit "1:1 contract between Peek and PeekResponseUnary" comment
+- `src/adapter/src/coord/peek.rs:873-920` — Response routing with UUID tracking
+- `src/adapter/src/coord/peek.rs:1174-1209` — `cancel_pending_peeks`: removes from client_pending_peeks then pending_peeks
+- `src/adapter/src/coord/peek.rs:1256-1268` — `remove_pending_peek`: consistency check between two maps
+- `src/adapter/src/coord/peek.rs:1221-1227` — `handle_peek_notification` removes before response
+
+### How It Works
+Peeks are tracked in two maps: `pending_peeks` (UUID -> PendingPeek) and `client_pending_peeks` (ConnectionId -> Set<UUID>). On response or cancellation, the peek is removed from both maps. Each UUID is unique (generated per-peek).
+
+### What Goes Wrong on Violation
+- Leaked peeks: UUID stays in pending_peeks forever, growing memory until OOM
+- Duplicate responses: client receives two result sets for one query
+- Missing responses: client hangs waiting for a peek that was silently dropped
+
+### The Race Condition
+The two-map removal (client_pending_peeks + pending_peeks) at lines 1256-1268 is not atomic. If CancelPendingPeeks races with PeekNotification:
+1. Cancel removes UUID from client_pending_peeks
+2. Peek response arrives, finds UUID in pending_peeks but not in client_pending_peeks
+3. Orphaned state or double-processing
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: At coordinator shutdown or periodically, add `assert_always!(pending_peeks.is_empty() || active_connections_exist)` to detect leaks
+- Candidate: On peek response, add `assert_always!` that UUID existed in pending_peeks before removal
+
+### Provenance
+Surfaced by Protocol Contracts and Concurrency focuses.
diff --git a/antithesis/scratchbook/properties/persist-cas-monotonicity.md b/antithesis/scratchbook/properties/persist-cas-monotonicity.md
new file mode 100644
index 0000000000000..46ab8e6dd7bfe
--- /dev/null
+++ b/antithesis/scratchbook/properties/persist-cas-monotonicity.md
@@ -0,0 +1,34 @@
+# persist-cas-monotonicity
+
+## Summary
+Persist shard state versions (SeqNo) must never decrease across any observation point.
+
+## Evidence
+
+### Code Paths
+- `src/persist-client/src/internal/state_versions.rs:48-87` — State version invariants: `earliest <= current.seqno`
+- `src/persist-client/src/internal/state.rs:84-95` — `ROLLUP_THRESHOLD` and seqno-based rollup logic
+- `src/persist-client/src/internal/state.rs:1324` — Invariant comment on rollup seqno
+- `src/persist-client/src/internal/gc.rs` — GC respects seqno ordering
+- `src/persist-client/src/write.rs:70-123` — WriteHandle CaS loop context
+
+### How It Works
+Every state mutation increments SeqNo. The CaS loop in Machine reads current state, computes new state with SeqNo+1, and atomically writes via consensus. If another writer interleaved, the CaS fails and the writer retries with the newer SeqNo. Rollups periodically snapshot state; rollup seqno must be <= current seqno.
+
+### What Goes Wrong on Violation
+SeqNo regression means state reconstruction from rollup + diffs produces wrong state. GC could delete diffs that are still needed. Writers could overwrite each other's changes. This is a data corruption scenario.
+
+### Failure Scenario
+1. Writer A reads state at SeqNo 100, begins computing new state
+2. Writer B reads state at SeqNo 100, writes SeqNo 101
+3. Writer A attempts to write SeqNo 101 — CaS should fail (current is now 101)
+4. **Expected**: A retries, reads SeqNo 101, writes SeqNo 102
+5. **Bug**: If the CaS comparison is stale, A's write at SeqNo 101 succeeds despite B's 101, overwriting B's change
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: `Machine::apply_unbatched_cmd` — add `assert_always!(new_seqno > old_seqno)` after every state transition
+- Candidate: State reconstruction from rollup + diffs — add `assert_always!` that reconstructed state matches expected
+
+### Provenance
+Surfaced by Data Integrity and Distributed Coordination focuses.
diff --git a/antithesis/scratchbook/properties/source-ingestion-progress.md b/antithesis/scratchbook/properties/source-ingestion-progress.md
new file mode 100644
index 0000000000000..aa3b83c54f9cd
--- /dev/null
+++ b/antithesis/scratchbook/properties/source-ingestion-progress.md
@@ -0,0 +1,27 @@
+# source-ingestion-progress
+
+## Summary
+Kafka source ingestion eventually makes progress — the source frontier advances.
+
+## Evidence
+
+### Code Paths
+- `src/storage/src/render/sources.rs` — Source operator assembly (Kafka, Postgres, MySQL connectors)
+- `src/storage/src/source/reclock.rs` — Timestamp reclocking from source timestamps to Materialize timeline
+- `src/storage/src/render/persist_sink.rs` — Writes ingested data to persist shards
+
+### How It Works
+Storage workers connect to external sources (Kafka brokers, Postgres replication slots), read data, reclock timestamps, and write to persist. The source's upper frontier advances as data is ingested and persisted.
+
+### What Goes Wrong on Violation
+Source stalls: materialized views stop updating, users see stale data indefinitely. This is the most visible user-facing failure mode for a streaming database.
+
+### Why This Is a Liveness Property
+We want to confirm the system reaches a state where source data is flowing. Under fault injection (network partitions to Kafka, storage worker crashes), the source should eventually resume and make progress.
+
+### SUT-Side Instrumentation Notes
+- Best verified at workload level: produce N messages to Kafka, query the source table, assert row count eventually reaches N
+- Candidate: Add `assert_sometimes!(source_frontier_advanced)` in the persist sink write path
+
+### Provenance
+Surfaced by Product Context focus.
diff --git a/antithesis/scratchbook/properties/storage-command-replay-idempotent.md b/antithesis/scratchbook/properties/storage-command-replay-idempotent.md
new file mode 100644
index 0000000000000..8046c29c5612e
--- /dev/null
+++ b/antithesis/scratchbook/properties/storage-command-replay-idempotent.md
@@ -0,0 +1,28 @@
+# storage-command-replay-idempotent
+
+## Summary
+Replaying storage command history after reconnection is idempotent — no duplicate ingestion or state divergence.
+
+## Evidence
+
+### Code Paths
+- `src/storage-controller/src/history.rs:20-80` — CommandHistory reduces and replays
+- `src/storage-controller/src/instance.rs:46-80` — Replica rehydration via command history
+- `src/storage-controller/src/persist_handles.rs:98-120` — Append retry semantics with Timestamp tracking
+
+### How It Works
+The storage controller maintains a command history for each replica. On reconnection, it replays the reduced history. The history is compacted to remove superseded commands (e.g., only the latest configuration for each source). Sources resume from the offsets recorded in persist, not from the beginning.
+
+### What Goes Wrong on Violation
+Duplicate data appears in sources. Since materialized views are computed incrementally from sources, duplicates propagate to all downstream views. Users see incorrect aggregation results (double-counted rows).
+
+### Key Subtlety
+Command history compaction assumes idempotency, but no explicit duplicate detection is observed in the code. If a RunIngestionCommand is partially executed (source starts but crashes before position is persisted), replay could re-ingest data from the last persisted offset, which may differ from the actual last-processed offset.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: After replay, add `assert_always!` that source read position >= position before crash
+- Candidate: After ingestion resumes, add `assert_always!` comparing row counts with expected deduplication
+
+### Provenance
+Surfaced by Failure Recovery focus.
diff --git a/antithesis/scratchbook/properties/strict-serializable-reads.md b/antithesis/scratchbook/properties/strict-serializable-reads.md
new file mode 100644
index 0000000000000..450d623b4c6f3
--- /dev/null
+++ b/antithesis/scratchbook/properties/strict-serializable-reads.md
@@ -0,0 +1,34 @@
+# strict-serializable-reads
+
+## Summary
+Reads respect the timestamp oracle's linearization point — later reads see all changes visible to earlier reads.
+
+## Evidence
+
+### Code Paths
+- `src/adapter/src/coord/timestamp_selection.rs:40-52` — When `chosen_ts` differs from `oracle_ts`, peek results must be delayed until oracle catches up
+- `src/adapter/src/coord/sequencer/inner.rs:2097-2116` — Strict serializable reads tracked via `strict_serializable_reads_tx`
+- `src/adapter/src/coord/timestamp_selection.rs:228-240` — `needs_linearized_read_ts` check
+- `src/adapter/src/coord/in_memory_oracle.rs:92-101` — Oracle timestamp advancement
+
+### How It Works
+The coordinator assigns every read a timestamp from the oracle. The oracle maintains a monotonically advancing timestamp. Strict serializable reads wait for the oracle to confirm their timestamp is linearized before returning results. This ensures no read can see a state "in the past" relative to another concurrent read.
+
+### What Goes Wrong on Violation
+Users observe non-repeatable reads: query A at time T sees data that query B at time T+1 does not see. This violates the strict serializability contract that is Materialize's primary differentiator from other streaming systems.
+
+### Workload-Level Verification
+This property is best verified at the workload level:
+1. Client A writes row R and receives acknowledgment
+2. Client B reads and must see R (or a later state including R)
+3. Client C reads and must see at least what B saw
+
+The workload checks SQL results, not internal state.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: `in_memory_oracle.rs` oracle advancement — add `assert_always!` that oracle timestamp never decreases
+- Candidate: After peek response, add workload-side `Always` assertion comparing read timestamp ordering with data ordering
+
+### Provenance
+Surfaced by Protocol Contracts focus (merged from timestamp-oracle-linearization and strict-serializable-ordering).
diff --git a/antithesis/scratchbook/properties/tombstone-sealing-finality.md b/antithesis/scratchbook/properties/tombstone-sealing-finality.md
new file mode 100644
index 0000000000000..bc97da01197ae
--- /dev/null
+++ b/antithesis/scratchbook/properties/tombstone-sealing-finality.md
@@ -0,0 +1,22 @@
+# tombstone-sealing-finality
+
+## Summary
+Once a shard is tombstoned (upper and since are both the empty antichain), no further mutations are possible.
+
+## Evidence
+
+### Code Paths
+- `src/persist-client/src/internal/state.rs:2128-2134` — `is_tombstone()` checks upper.is_empty() && since.is_empty() && writers.is_empty() && critical_readers.is_empty()
+- `src/persist-client/src/internal/state.rs:1703-1712` — compare_and_append short-circuits on tombstone
+- `src/persist-client/src/internal/state.rs:2146-2159` — `become_tombstone_and_shrink()` transition
+
+### What Goes Wrong on Violation
+If a tombstoned shard accepts new writes, deleted tables/views could have data resurrected. This would confuse users and violate the contract that DROP TABLE removes data permanently.
+
+### SUT-Side Instrumentation Notes
+- No existing Antithesis assertions
+- Candidate: After `is_tombstone()` returns true, add `assert_always!` that subsequent append attempts return error
+- Candidate: `become_tombstone_and_shrink()` — add `assert_unreachable!` on any path where a mutation succeeds after the transition
+
+### Provenance
+Surfaced by Data Integrity focus.
diff --git a/antithesis/scratchbook/property-catalog.md b/antithesis/scratchbook/property-catalog.md
new file mode 100644
index 0000000000000..ffbba999a7031
--- /dev/null
+++ b/antithesis/scratchbook/property-catalog.md
@@ -0,0 +1,217 @@
+---
+commit: ca6deb6758e651876582ae7d4dec24ce32d87567
+updated: 2026-05-06
+---
+
+# Property Catalog: Materialize
+
+## Category 1: Data Integrity Under Faults
+
+Properties that verify data correctness when crashes, network partitions, and concurrent access interact with the persist layer and catalog.
+
+### epoch-fencing-prevents-split-brain — Epoch-Based Fencing Prevents Split-Brain Writes
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P0 — fundamental split-brain prevention; failure here corrupts all state |
+| **Property** | After a coordinator restart with a higher epoch, the old coordinator (lower epoch) cannot successfully write to the catalog persist shard. |
+| **Invariant** | `Always`: once a higher epoch is written to consensus, any compare_and_append from a lower epoch must fail with FenceError. This is a strict safety invariant — every check must hold. |
+| **Antithesis Angle** | Network partition separates old coordinator from consensus while new coordinator starts with higher epoch. When partition heals, old coordinator's in-flight writes must be rejected. Antithesis explores the timing window between old coordinator's last successful write and new coordinator's first write. |
+| **Why It Matters** | Split-brain writes corrupt the catalog, potentially causing data loss or inconsistent schema state. This is the fundamental distributed safety mechanism. Surfaced by: Distributed Coordination, Failure Recovery. |
+
+### persist-cas-monotonicity — Persist SeqNo Never Decreases
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P0 — backbone of persist consistency; all other persist properties depend on this |
+| **Property** | Persist shard state versions (SeqNo) form a strictly increasing sequence. No writer can observe or apply a lower SeqNo after observing a higher one. |
+| **Invariant** | `Always`: for any shard, if SeqNo N is observed, no subsequent observation returns SeqNo < N. Rollups maintain seqno <= seqno_since. This must hold on every check — a single violation means state corruption. |
+| **Antithesis Angle** | Partition storage from persist backend mid-write. One writer races to increment SeqNo while another caches an old value and retries. Crash during GC/rollup operations. Antithesis explores interleaving of concurrent CaS loops. |
+| **Why It Matters** | SeqNo monotonicity is the backbone of persist's consistency model. Violations cause state reconstruction failures and data loss. Surfaced by: Data Integrity, Distributed Coordination. |
+
+### tombstone-sealing-finality — Tombstoned Shards Are Immutable
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — prevents zombie writes to dropped collections |
+| **Property** | Once a shard's upper and since both advance to the empty antichain (tombstone), no new writes, reader registrations, or writer registrations can succeed. The transition is irreversible. |
+| **Invariant** | `Always`: after `is_tombstone()` returns true, any append, downgrade_since, or registration attempt must fail. The state machine must never revert from tombstone. |
+| **Antithesis Angle** | Crash and restart after tombstone. Fire concurrent write/read attempts while state is being replayed from consensus. Antithesis explores whether recovery code can accidentally un-tombstone a shard. |
+| **Why It Matters** | Tombstone finality prevents zombie writes to dropped collections. Violation could resurface deleted data. Surfaced by: Data Integrity. |
+
+### idempotent-write-under-indeterminate — Compare-and-Append Idempotency on Retry
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — indeterminate errors are the hardest distributed systems edge case |
+| **Property** | When compare_and_append receives an Indeterminate error from consensus and retries with the same idempotency token, the shard contains exactly one copy of the write — never zero, never two. |
+| **Invariant** | `Always`: after retry with identical IdempotencyToken, the shard's upper reflects exactly one successful write. Duplicate data must never appear in the shard trace. |
+| **Antithesis Angle** | Inject network failures on consensus calls mid-flight. Kill writer after batch is queued but before state is committed. Antithesis explores the window between consensus write and acknowledgment. |
+| **Why It Matters** | Indeterminate errors are the hardest to handle correctly in distributed systems. Duplication or loss here silently corrupts downstream materialized views. Surfaced by: Data Integrity. |
+
+## Category 2: Consistency Model Enforcement
+
+Properties that verify Materialize's strict serializability guarantee and timestamp oracle correctness.
+
+### strict-serializable-reads — Reads Respect Timestamp Oracle Linearization
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P0 — Materialize's core advertised guarantee; user-visible |
+| **Property** | Two reads on the same collection at timestamps t1 < t2 (assigned by the oracle) must observe consistent ordering: if t1 sees state S, t2 cannot observe a state prior to S. |
+| **Invariant** | `Always`: for any two reads where oracle assigns t1 < t2, the result at t2 must include all changes visible at t1. The oracle read timestamp must advance monotonically. |
+| **Antithesis Angle** | Run parallel transactions in StrictSerializable mode. One writes, another reads concurrently. Inject delays in oracle timestamp advancement. Antithesis explores whether reads can bypass the linearization point. |
+| **Why It Matters** | Strict serializability is Materialize's core advertised guarantee. Users explicitly choose it over eventual consistency. Violation is a correctness bug visible to end users. Surfaced by: Protocol Contracts. |
+
+### catalog-recovery-consistency — Catalog State Consistent After Crash Recovery
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — catalog corruption on recovery prevents system from starting |
+| **Property** | After coordinator crash and restart, the recovered catalog state is equivalent to the pre-crash state: upper never decreases, snapshot is consolidated, and all committed transactions are visible. |
+| **Invariant** | `Always`: upper(post_restart) >= upper(pre_crash). After sync_to_current_upper(), the snapshot contains no unconsolidated entries (all diffs resolved). |
+| **Antithesis Angle** | Crash coordinator during catalog_transact (after some updates persist but before upper advances). Crash during consolidation. Antithesis explores the timing of crashes within the catalog write path. |
+| **Why It Matters** | Catalog inconsistency after recovery can cause schema corruption, lost DDL, or inability to restart. Surfaced by: Failure Recovery. |
+
+## Category 3: Compute and Storage Recovery
+
+Properties that verify correct behavior during and after process crashes in the compute and storage layers.
+
+### compute-replica-epoch-isolation — Stale Replica Commands Rejected After Rehydration
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — stale commands cause compute divergence and wrong query results |
+| **Property** | Each compute replica incarnation has a unique epoch (nonce + u64). After rehydration with epoch N+1, no commands from epoch N can execute or affect dataflow state. |
+| **Invariant** | `Always`: once a command with epoch N+1 is processed, all epoch N commands are dropped. The epoch forms a strict ordering on replica incarnations. |
+| **Antithesis Angle** | Kill compute replica mid-dataflow. Controller rehydrates with new epoch. In-flight commands from the old epoch leak back due to network buffering. Antithesis explores whether stale commands can sneak past the epoch check. |
+| **Why It Matters** | Stale command execution causes compute replicas to diverge from the coordinator's expected state, potentially returning wrong query results. Surfaced by: Distributed Coordination. |
+
+### storage-command-replay-idempotent — Storage Command History Replay Is Idempotent
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — non-idempotent replay causes data duplication in all downstream MVs |
+| **Property** | When a storage replica reconnects, the controller replays command history from the last frontier. Replaying the same commands twice yields identical state — no duplicated ingestion or state divergence. |
+| **Invariant** | `Always`: apply(history[0:i]) + apply(history[0:i]) == apply(history[0:i]). Source ingestion positions must resume from persisted offsets, not restart from zero. |
+| **Antithesis Angle** | Crash storage controller mid-send of RunIngestionCommand. Restart and replay history. Antithesis explores whether partial command delivery causes duplicate ingestion. |
+| **Why It Matters** | Non-idempotent replay causes duplicate data in sources, which propagates to all downstream materialized views. Surfaced by: Failure Recovery. |
+
+## Category 4: Concurrency and Race Conditions
+
+Properties that verify correctness under concurrent access patterns within the coordinator.
+
+### group-commit-toctou-safety — No Phantom Writes to Deleted Tables
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — phantom writes corrupt catalog; TOCTOU explicitly acknowledged in code |
+| **Property** | If a table is deleted between when a write is deferred and when group_commit executes, the write is silently dropped — not persisted. No phantom writes to non-existent tables. |
+| **Invariant** | `Always`: if catalog.try_get_entry(table_id) returns None at group_commit time, the write's updates are not included in the committed batch. |
+| **Antithesis Angle** | Concurrent table deletion + write operations. Antithesis injects delays between deferred write queuing and the group_commit catalog check, exposing the TOCTOU window where the table ceases to exist between validation and execution. |
+| **Why It Matters** | Phantom writes to deleted tables corrupt the catalog or cause panics during downstream processing. The explicit TOCTOU check in appends.rs:479-486 acknowledges this risk. Surfaced by: Concurrency. |
+
+### peek-lifecycle-exactly-once — Each Peek Gets Exactly One Response
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — leaked peeks cause OOM; explicit 1:1 contract documented |
+| **Property** | For each peek command sent to compute, exactly one PeekResponse is delivered to the client — no duplicates, no missing responses, no orphaned pending_peeks entries. |
+| **Invariant** | `Always`: count(peek_commands) == count(peek_responses) with bijective UUID mapping. When CancelPendingPeeks races with PeekNotification, exactly one of (canceled, completed) occurs — never both, never neither. |
+| **Antithesis Angle** | Trigger replica failures mid-peek. Race cancel requests with response delivery. Antithesis explores the two-map removal sequence (client_pending_peeks + pending_peeks) that is not atomic. |
+| **Why It Matters** | Leaked peeks cause memory growth and eventually OOM. Duplicate responses confuse clients. The 1:1 contract is explicitly documented in peek.rs:80-95. Surfaced by: Protocol Contracts, Concurrency. |
+
+### command-channel-ordering — Timely Workers See Commands in Identical Order
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P2 — code explicitly acknowledges ordering is unguaranteed; hard to trigger |
+| **Property** | CreateDataflow commands broadcast through the command channel execute in identical order across all Timely workers — no reordering. |
+| **Invariant** | `Always`: for any two workers W1 and W2, if W1 sees command A before B, W2 also sees A before B. Code comment at command_channel.rs:88-90 explicitly notes this relies on "Timely channels preserving order of inputs, which is not something they guarantee." |
+| **Antithesis Angle** | Inject timing delays in the source operator between command channel invocations. Stress the sync_activator bridge between sync and async contexts. Antithesis explores whether worker scheduling variations cause reordering. |
+| **Why It Matters** | Command reordering causes workers to diverge, producing inconsistent dataflow results. The code explicitly acknowledges this is unguaranteed. Surfaced by: Concurrency. |
+
+## Category 5: Lifecycle Transitions
+
+Properties about 0DT deployment, startup, and shutdown correctness.
+
+### deployment-promotion-safety — 0DT Promotion Only After Full Catchup
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P2 — relevant for cloud deployments; requires multi-coordinator setup |
+| **Property** | During 0DT deployment, the new coordinator transitions to ReadyToPromote only after catalog is loaded, caught-up checks pass, and all replica frontiers have advanced past the required threshold. Promotion with stale replicas is prevented. |
+| **Invariant** | `Always`: at the moment set_ready_to_promote() is called, all collections tracked by caught_up checks have frontiers >= the cutoff threshold. The deployment generation fence prevents the old coordinator from writing after promotion. |
+| **Antithesis Angle** | Trigger preflight concurrently with replica startup. Crash replicas during catchup. Antithesis explores whether the caught_up check can pass while a replica is still lagging or crash-looping. |
+| **Why It Matters** | Premature promotion causes the new coordinator to serve stale data or fail to serve at all. This is the primary risk in zero-downtime deployments. Surfaced by: Lifecycle, Distributed Coordination. |
+
+### deployment-lag-detection — Caught-Up Check Detects Stuck Replicas
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P2 — companion to deployment-promotion-safety; requires 0DT setup |
+| **Property** | During 0DT catchup, maybe_check_caught_up() eventually detects replicas that are lagging beyond configured thresholds or crash-looping, and prevents promotion until resolved. |
+| **Invariant** | `Sometimes(lagging_replica_detected)`: Antithesis should observe at least one scenario where a lagging/crashing replica is detected and promotion is blocked. This is a liveness property — the detection must eventually happen. |
+| **Antithesis Angle** | Inject replica crashes during catchup phase. Verify the analyze_replica_looping() function identifies the problem via mz_cluster_replica_status_history. |
+| **Why It Matters** | Undetected stuck replicas during 0DT deployment lead to silent data staleness in production. Surfaced by: Lifecycle. |
+
+## Category 6: Reachability and Coverage
+
+Properties that verify the system reaches interesting states under fault injection.
+
+### fault-recovery-exercised — System Recovers from Coordinator Crash
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P0 — most fundamental operational property; prerequisite for all others |
+| **Property** | After the coordinator (environmentd) crashes and restarts, the system eventually becomes healthy (readiness endpoint returns 200) and can serve SQL queries. |
+| **Invariant** | `Sometimes(healthy_after_crash)`: the system must reach a state where it can serve queries after a crash. This confirms recovery works end-to-end, not just in unit tests. |
+| **Antithesis Angle** | Kill environmentd at various points during operation. Verify it restarts, reconnects to persist, recovers catalog, and serves queries. Antithesis explores crash timing — during DDL, during peek, during group_commit. |
+| **Why It Matters** | Recovery is the most critical operational property. If it doesn't work, nothing else matters. Surfaced by: Failure Recovery. |
+
+### source-ingestion-progress — Kafka Source Eventually Catches Up
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P2 — important but requires Kafka/Redpanda in topology |
+| **Property** | After creating a Kafka source, Materialize eventually ingests all available data and the source's write frontier advances past the data's timestamps. |
+| **Invariant** | `Sometimes(source_frontier_advances)`: the source's upper antichain must advance at least once during the test run, confirming data is flowing through the ingestion pipeline. |
+| **Antithesis Angle** | Create a Kafka source, produce messages, then inject network faults between Materialize and Redpanda. Verify the source eventually catches up when connectivity is restored. |
+| **Why It Matters** | Source ingestion is the primary data path. If it stalls, all downstream materialized views stop updating. Surfaced by: Product Context. |
+
+### mv-reflects-source-updates — Materialized Views Eventually Reflect Source Changes
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P1 — end-to-end user-visible correctness; Materialize's core value |
+| **Property** | After data is written to a source, materialized views that depend on that source eventually reflect the new data. |
+| **Invariant** | `Sometimes(mv_contains_new_data)`: after inserting data into a table or producing to a Kafka source, a SELECT on a dependent materialized view must eventually return the new data. |
+| **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. |
+| **Why It Matters** | This is the end-to-end user-visible correctness property. Materialize's value proposition is that MVs are always up-to-date. Surfaced by: Product Context. |
+
+### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — incorrect fencing allows premature GC causing data loss |
+| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. |
+| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. |
+| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. |
+| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. |
diff --git a/antithesis/scratchbook/property-relationships.md b/antithesis/scratchbook/property-relationships.md
new file mode 100644
index 0000000000000..4df508acd6f03
--- /dev/null
+++ b/antithesis/scratchbook/property-relationships.md
@@ -0,0 +1,56 @@
+# Property Relationships
+
+## Cluster 1: Persist Layer Safety
+
+**Properties**: `persist-cas-monotonicity`, `tombstone-sealing-finality`, `idempotent-write-under-indeterminate`, `critical-reader-fence-linearization`
+
+These properties share the persist state machine code in `src/persist-client/src/internal/`. They all exercise the compare-and-swap loop in `Machine` and the `State` transitions. A bug in the CaS loop or state validation could violate multiple properties simultaneously.
+
+**Suspected dominance**: `persist-cas-monotonicity` is foundational — if SeqNo monotonicity breaks, all other persist properties likely break too. It dominates `tombstone-sealing-finality` and `critical-reader-fence-linearization`.
+
+## Cluster 2: Fencing and Split-Brain Prevention
+
+**Properties**: `epoch-fencing-prevents-split-brain`, `compute-replica-epoch-isolation`, `deployment-promotion-safety`
+
+These properties all use epoch-based fencing to prevent stale actors from mutating state. They share the pattern of "increment epoch on new incarnation, reject operations from old epoch." The catalog fencing (`epoch-fencing-prevents-split-brain`) and deployment fencing (`deployment-promotion-safety`) share code paths in `src/catalog/src/durable/persist.rs`.
+
+**Suspected dominance**: `epoch-fencing-prevents-split-brain` is the most fundamental — it protects the catalog. `deployment-promotion-safety` builds on it by also requiring caught-up checks before promotion. `compute-replica-epoch-isolation` is independent (different epoch mechanism for compute).
+
+## Cluster 3: Crash Recovery Pipeline
+
+**Properties**: `catalog-recovery-consistency`, `storage-command-replay-idempotent`, `fault-recovery-exercised`
+
+These properties test the recovery path after process crashes. `fault-recovery-exercised` is the end-to-end liveness check; `catalog-recovery-consistency` and `storage-command-replay-idempotent` test specific subsystems within recovery.
+
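+**Suspected dominance**: `fault-recovery-exercised` is the weakest check (just "system comes back"). `catalog-recovery-consistency` is strictly stronger (catalog state is correct after recovery). If the catalog cannot be recovered at all, end-to-end recovery also fails; the converse does not hold — the system can come back healthy with an incorrect catalog, which only `catalog-recovery-consistency` would catch.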
+
+## Cluster 4: Consistency Model
+
+**Properties**: `strict-serializable-reads`, `mv-reflects-source-updates`, `source-ingestion-progress`
+
+These properties form a chain: source ingestion feeds materialized views, which serve reads. `strict-serializable-reads` depends on correct timestamp oracle behavior and frontier management. If `source-ingestion-progress` fails (data doesn't flow), `mv-reflects-source-updates` also fails, but `strict-serializable-reads` could still pass on stale but consistent data.
+
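+**Suspected dominance**: `strict-serializable-reads` is independent of the liveness properties. `mv-reflects-source-updates` implies `source-ingestion-progress` only when the MV is built on a Kafka source (if such an MV updates, its source must have made progress); an MV over a table can update without exercising source ingestion.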
+
+## Cluster 5: Coordinator Concurrency
+
+**Properties**: `group-commit-toctou-safety`, `peek-lifecycle-exactly-once`, `command-channel-ordering`
+
+These properties target different concurrency mechanisms within the coordinator and compute engine. They share the coordinator's event loop as the execution context but test independent subsystems (write path, read path, command dispatch).
+
+**No dominance**: These properties are independent of each other. A bug in peek handling doesn't imply a bug in group_commit or command channels.
+
+## Cluster 6: Deployment Lifecycle
+
+**Properties**: `deployment-promotion-safety`, `deployment-lag-detection`
+
+Both test the 0DT deployment pipeline. `deployment-lag-detection` is a prerequisite for `deployment-promotion-safety` — if lag detection fails, promotion may proceed unsafely.
+
+**Suspected dominance**: `deployment-promotion-safety` is stronger — it requires both lag detection and correct fencing. `deployment-lag-detection` is a liveness check on a subsystem of the promotion pipeline.
+
+## Cross-Cluster Connections
+
+- `epoch-fencing-prevents-split-brain` (Cluster 2) protects `catalog-recovery-consistency` (Cluster 3) — fencing ensures only one writer during recovery
+- `persist-cas-monotonicity` (Cluster 1) underpins `catalog-recovery-consistency` (Cluster 3) — catalog is stored in persist, so CaS correctness is a prerequisite
+- `strict-serializable-reads` (Cluster 4) depends on `epoch-fencing-prevents-split-brain` (Cluster 2) — split-brain would allow inconsistent timestamp assignments
+- `idempotent-write-under-indeterminate` (Cluster 1) protects `storage-command-replay-idempotent` (Cluster 3) — storage ingestion uses persist writes, so idempotency matters for both
diff --git a/antithesis/scratchbook/sut-analysis.md b/antithesis/scratchbook/sut-analysis.md
new file mode 100644
index 0000000000000..a0ff7561eed5e
--- /dev/null
+++ b/antithesis/scratchbook/sut-analysis.md
@@ -0,0 +1,217 @@
+# SUT Analysis: Materialize
+
+## System Overview
+
+Materialize is a real-time data integration platform and streaming SQL database written primarily in Rust. It reads change data from PostgreSQL (logical replication), MySQL, Kafka/Redpanda, and webhooks, then maintains materialized views incrementally using differential dataflow. It speaks the PostgreSQL wire protocol, so any psql client or Postgres driver can connect.
+
+The system claims **strict serializability** for interactive queries and provides **incremental, consistent, low-latency** results over streaming data. It does not offer approximate answers or eventual consistency.
+
+## Architecture
+
+### Three-Layer Design
+
+Materialize is organized into three logical layers that run as separate processes:
+
+**1. Adapter Layer (environmentd)**
+- Main coordinator process (`src/environmentd/`)
+- Hosts pgwire server (port 6875), HTTP API (6878), and internal coordination endpoints
+- Parses SQL, plans queries, manages sessions, enforces consistency
+- Contains the Catalog (schema metadata) in memory, persisted to durable storage
+- Runs a **single-threaded async event loop** on a Tokio runtime for coordination
+- Multiplexes ComputeController and StorageController to manage downstream clusters
+
+**2. Compute Layer (clusterd - compute)**
+- Worker processes running Timely Dataflow engines (`src/compute*/`, `src/clusterd/`)
+- Executes views, maintains materialized views, performs joins
+- Stateless — can be rehydrated from storage on crash
+- Multiple replicas provide active replication for HA
+- Workers parallelize via native OS threads (one per Timely worker)
+
+**3. Storage Layer (clusterd - storage)**
+- Worker processes for data ingestion (`src/storage*/`)
+- Reads from external sources (Kafka, Postgres CDC, MySQL, webhooks)
+- Reclocks source timestamps to Materialize's internal timeline
+- Writes to Persist (blob storage + consensus) for durability
+- Manages sinks (Kafka sinks with exactly-once semantics)
+
+### Communication Protocols
+
+| Path | Protocol | Details |
+|------|----------|---------|
+| Client -> Balancerd -> Environmentd | pgwire (PostgreSQL wire protocol) | TLS, port 6875 |
+| Environmentd -> Clusterd | CTP (Cluster Transport Protocol) | Length-prefixed bincode over TCP/UDS, ports 2100-2101 |
+| Clusterd workers <-> workers | Timely mesh | Generation-epoch protocol, ports 2102-2103 |
+| Clusterd -> Persist | HTTP/S3 API | Blob storage writes + consensus CaS |
+| Environmentd -> Persist | Direct | Catalog stored in persist shard |
+| Clusterd -> Environmentd | Persist PubSub | HTTP on port 6879, state change subscriptions |
+
+### Key Entrypoints
+
+- `src/environmentd/src/environmentd/main.rs` — main server startup
+- `src/clusterd/src/bin/clusterd.rs` — compute/storage worker startup
+- `src/balancerd/` — stateless connection router
+- `src/pgwire/` — PostgreSQL wire protocol implementation
+- `src/adapter/` — SQL planning, coordination, session management
+
+## State Management
+
+### Five Tiers of State
+
+1. **Catalog metadata** — table/view/source/sink definitions, roles, clusters
+   - Stored in a persist shard (blob + consensus)
+   - Reconstructed into `CatalogState` in-memory on startup
+   - Mutated via `catalog_transact()` with atomic `TransactionBatch` writes
+
+2. **Source/ingestion data** — rows from Kafka, Postgres CDC, MySQL, webhooks
+   - Written to persist shards by storage workers
+   - Keyed by Materialize-assigned timestamps (reclocked from source timestamps)
+
+3. **Materialized view data** — output of incrementally-maintained computations
+   - Written to persist shards by compute workers
+   - Stored as columnar batches in blob storage
+
+4. **Timestamps/frontiers** — read/write boundaries tracking collection completeness
+   - `since` (read frontier): minimum time a collection can be read
+   - `upper` (write frontier): exclusive upper bound of written times — the collection is complete for all times strictly less than `upper`
+   - Tracked as `Antichain<Timestamp>` lattice values
+   - Global timestamp oracle provides causally-consistent read times
+
+5. **In-flight state** — active dataflow computations, pending peeks, session state
+   - Held in memory by compute/storage workers and the coordinator
+   - Lost on crash, recovered via replay from persist
+
+### Persistence Architecture
+
+**Blob Storage (S3/MinIO/Azure/Postgres-backed):**
+- Immutable data batches (columnar Parquet/Arrow format)
+- Rollups (periodic snapshots of shard state for fast recovery)
+
+**Consensus (CockroachDB/PostgreSQL/FoundationDB):**
+- Shard metadata: `since`, `upper`, spine structure
+- Writer/reader leases with heartbeats
+- Sequence numbers (`SeqNo`) for version linearity
+- Catalog mutations as `StateUpdate` events
+
+**Atomic Writes:**
+- Compare-and-append via `Machine<K,V,T,D>`: writers must match expected `upper` antichain
+- Idempotency tokens prevent duplicates on retries
+- Fencing via `FenceToken` (deploy generation + epoch) prevents split-brain
+
+## Concurrency Model
+
+### Coordinator (environmentd)
+- **Single-threaded event loop** on Tokio runtime
+- Processes commands via `tokio::select!` from multiple MPSC channels
+- Per-object write locks (`Arc<tokio::sync::Mutex<()>>`) serialize DDL to same object
+- Catalog shared as `Arc<Catalog>` for read-only off-thread access; mutations are serialized through the event loop
+- Timeline state (`global_timelines`) accessed serially within event loop
+
+### Compute/Storage Workers (clusterd)
+- One native OS thread per Timely worker (configurable count)
+- Workers coordinate via Timely's internal barriers and distributed snapshot semantics
+- Commands received via MPSC channels from controllers
+- Worker 0 broadcasts commands to other workers per Timely conventions
+
+### Synchronization Primitives
+- `Arc<tokio::sync::Mutex>` for per-object write locks
+- `mpsc::UnboundedSender/Receiver` for coordinator internal messaging
+- `watch::Sender/Receiver` for per-connection cancellation
+- `Arc<Mutex>` (std) for low-contention shared state (metrics, log writers)
+- Timely's own worker-to-worker channels for dataflow coordination
+
+## Safety and Liveness Guarantees
+
+### Claimed Safety Guarantees
+
+1. **Strict Serializability** (design doc 20220516): "Transactions in Materialize are strictly serializable with respect to operations inside of Materialize" (SELECT, INSERT, UPDATE, DELETE). All timestamp transitions are made durable before a response is issued.
+
+2. **Definiteness** (design doc 20210831): Collections are "definite" — all uses yield exactly the same time-varying data at each logical time. Data definite for times in range `[since, upper)`.
+
+3. **Exactly-Once Kafka Sinks** (design doc 20200520): Transactional consistency for Kafka sink output with consistency topic.
+
+4. **Acknowledged Writes Survive Failures**: All data written to persist (blob + consensus) before acknowledgment. Catalog mutations durable before response.
+
+5. **Epoch-Based Leader Fencing**: New coordinators increment epoch on startup; old coordinators' transactions fail. Prevents split-brain after coordinator crash.
+
+### Claimed Liveness Guarantees
+
+1. **Persist Reader/Writer Liveness**: "At least one reader/writer can always make progress" even when peers are paused or restarted.
+
+2. **Collection Progress**: "The collection upper advances so long as one writer can make progress."
+
+3. **Active Replication Recovery**: "Masking of recovery delay can only be guaranteed when compute controller can reach at least one non-faulty replica."
+
+4. **Automatic Failover**: Compute replicas automatically rehydrate from storage on crash. Multiple replicas mask recovery latency.
+
+### Limitations
+- HA (multi-active replication) is cloud-only; self-managed has single coordinator
+- SUBSCRIBE, sinks, and `AS OF` queries may circumvent strict serializability
+- No Byzantine fault tolerance; the system assumes an honest coordinator
+- Single coordinator bottleneck for timestamp oracle
+
+## Failure and Degradation Modes
+
+### Failure-Prone Areas
+
+1. **Startup/Configuration**: Many `expect()`/`unwrap()` calls in startup path — misconfiguration causes immediate crash rather than degraded operation.
+
+2. **Replica Reconnection**: Infinite retry with exponential backoff (capped at 1s). Can cause minutes-long recovery latency during transient failures. No circuit breakers.
+
+3. **Persist Layer Failures**: No circuit breaker for blob/consensus unavailability. System retries with backoff, creating backpressure rather than failing fast. Bounded retry loops (3-5 attempts) for some storage management operations.
+
+4. **0DT Deployment**: Preflight checks with configurable timeout. Can either panic or proceed degraded if standby doesn't catch up. Read-only promotion before full read-write.
+
+### Health Checking
+- `/health/liveness` — always returns 200 (process is alive)
+- `/health/ready` — returns 503 until adapter client available; optional `wait=true` blocks
+- `curl localhost:6878/api/readyz` used in Docker healthchecks
+
+### Graceful Degradation
+- Compute replicas: partial replica failure tolerated; system serves from remaining replicas
+- 0DT standby boots read-only, promotes after catching up
+- Feature flags return 503 rather than crashing when disabled
+- No graceful degradation for metadata store (CRDB/PG) unavailability — system halts
+
+## External Dependencies
+
+| Dependency | Role | Criticality |
+|-----------|------|-------------|
+| CockroachDB / PostgreSQL / FoundationDB | Consensus for persist + catalog | CRITICAL — system halts without it |
+| S3 / MinIO / Azure Blob | Blob storage for persist data | CRITICAL — writes fail without it |
+| Kafka / Redpanda | Stream source ingestion | CRITICAL for streaming workflows |
+| PostgreSQL (source) | CDC replication source | CRITICAL for CDC workflows |
+| MySQL (source) | CDC replication source | Optional |
+| Schema Registry | Avro/Protobuf schema management | Required for typed Kafka sources |
+| Balancerd | pgwire connection routing | CRITICAL for multi-tenant |
+
+## Existing Test Strategy
+
+### mzcompose Framework (`misc/python/materialize/mzcompose/`)
+- Meta-test framework generating Docker Compose files dynamically
+- `Composition` class loads `mzcompose.py` files, discovers `workflow_*()` functions
+- Pre-built service classes: `Materialized`, `Clusterd`, `Kafka`, `Redpanda`, `Postgres`, `CockroachOrPostgresMetadata`, `Minio`, `Toxiproxy`, etc.
+- Granular lifecycle control: `c.up()`, `c.kill()`, `c.stop()`, `c.pause()`, `c.override()`
+- Generates YAML on-demand, passes to `docker compose` via file descriptors
+- Health-check driven startup with configurable intervals
+
+### Test Frameworks
+1. **testdrive (.td)** — declarative SQL test language with timeout assertions and version-conditional tests
+2. **sqllogictest (.slt)** — standard SQL logic test format for correctness
+3. **Platform Checks** — "write once, run everywhere" tests across upgrade/restart/failure scenarios
+4. **parallel-workload** — random concurrent SQL operations stress testing
+
+### Failure Testing Coverage
+**Tested**: clusterd crashes/recovery, CockroachDB restarts, network faults (Toxiproxy), failpoint injection, statement timeouts, source/sink resilience, 0DT deployments
+
+**Not tested at scale**: coordinated multi-node cascading failures, deterministic replay of timing-sensitive bugs, property-based invariant testing under adversarial fault injection — this is where Antithesis adds value
+
+## Assumptions
+- The mzcompose-based Docker Compose approach is the right integration path (vs. K8s)
+- The existing Antithesis K8s-based experiment scripts represent an older approach to be superseded
+- Materialize's self-managed/community edition (single-node) is the target, not the cloud multi-tenant version
+
+## Open Questions
+- Which mzcompose test suite(s) provide the best starting workload? (platform-checks, parallel-workload, or custom)
+- What is the preferred metadata store for Antithesis testing — CockroachDB or PostgreSQL?
+- Should we test with multiple compute replicas or single replica?
+- Are there specific failure scenarios the Materialize team wants prioritized?
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
new file mode 100644
index 0000000000000..61334a63cc461
--- /dev/null
+++ b/test/antithesis/export-compose.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Export the resolved docker-compose YAML for the Antithesis composition.
+
+Loads the mzcompose composition and dumps the compose dict to stdout as
+YAML — without building any images or requiring a running Docker daemon.
+
+mzbuild references are replaced with public images where possible,
+or local tags for images that must be built (e.g. the workload).
+
+Usage:
+    bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml
+"""
+
+import sys
+from pathlib import Path
+
+import yaml
+
+from materialize.mzbuild import Repository
+from materialize.mzcompose.composition import Composition
+
+# Map mzbuild names → image references for the Antithesis compose.
+# Public images for infra; local build tag for the workload.
+MZBUILD_TO_IMAGE = {
+    "materialized": "materialize/materialized:latest",
+    "postgres": "postgres:17.7",
+    "minio": "minio/minio:latest",
+    "antithesis-workload": "materialize-workload:latest",
+}
+
+repo = Repository(Path("."), arch="x86_64")
+c = Composition(repo, "antithesis", munge_services=False)
+
+for name, svc in c.compose["services"].items():
+    svc["platform"] = "linux/amd64"
+
+    if "mzbuild" in svc:
+        mzbuild_name = svc.pop("mzbuild")
+        if mzbuild_name not in MZBUILD_TO_IMAGE:
+            print(
+                f"warning: no image mapping for mzbuild {mzbuild_name!r}, "
+                f"using {mzbuild_name}:latest",
+                file=sys.stderr,
+            )
+            svc["image"] = f"{mzbuild_name}:latest"
+        else:
+            svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name]
+
+    # Vanilla postgres needs trust auth to match the mzbuild image behavior
+    # (materialized connects as root with no password)
+    if svc.get("image", "").startswith("postgres:"):
+        svc.setdefault("environment", []).append("POSTGRES_HOST_AUTH_METHOD=trust")
+
+    # Drop mzcompose-only keys that docker/podman compose doesn't understand
+    for key in ["propagate_uid_gid", "allow_host_ports", "publish"]:
+        svc.pop(key, None)
+
+yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
new file mode 100644
index 0000000000000..d84b0f0108bd5
--- /dev/null
+++ b/test/antithesis/mzcompose.py
@@ -0,0 +1,88 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""
+Antithesis test composition for Materialize.
+
+Defines the minimal topology needed to exercise Materialize under Antithesis:
+  - postgres-metadata: consensus/catalog store
+  - minio: S3-compatible blob storage for persist
+  - redpanda: Kafka-compatible broker for source ingestion
+  - materialized: the SUT (embedded clusterd mode)
+  - workload: Python test driver with Antithesis SDK
+
+Usage:
+  bin/mzcompose --find antithesis run default        # bring up the cluster
+  bin/mzcompose --find antithesis run export-compose  # dump compose YAML
+"""
+
+import sys
+
+import yaml
+
+from materialize.mzcompose.composition import Composition, WorkflowArgumentParser
+from materialize.mzcompose.service import Service, ServiceConfig
+from materialize.mzcompose.services.materialized import Materialized
+from materialize.mzcompose.services.minio import Minio
+from materialize.mzcompose.services.postgres import PostgresMetadata
+from materialize.mzcompose.services.redpanda import Redpanda
+
+
+class Workload(Service):
+    """Antithesis workload client — Python test driver."""
+
+    def __init__(self) -> None:
+        config: ServiceConfig = {
+            "mzbuild": "antithesis-workload",  # image name declared in test/antithesis/workload/mzbuild.yml
+            "depends_on": {  # wait until both services report healthy
+                "materialized": {"condition": "service_healthy"},
+                "redpanda": {"condition": "service_healthy"},
+            },
+            "environment": [  # read by the workload scripts (e.g. the psql health check)
+                "PGHOST=materialized",
+                "PGPORT=6875",
+                "PGUSER=materialize",
+                "KAFKA_BROKER=kafka:9092",  # NOTE(review): no "kafka" service is declared here; presumably a redpanda network alias — confirm
+                "SCHEMA_REGISTRY_URL=http://schema-registry:8081",  # NOTE(review): likewise assumes a "schema-registry" alias — confirm
+            ],
+        }
+        super().__init__(name="workload", config=config)
+
+
+SERVICES = [
+    PostgresMetadata(),
+    Minio(setup_materialize=True),
+    Redpanda(auto_create_topics=True),
+    Materialized(
+        external_blob_store=True,
+        external_metadata_store=True,
+        metadata_store="postgres-metadata",
+        unsafe_mode=True,
+        soft_assertions=True,
+        sanity_restart=False,
+    ),
+    Workload(),
+]
+
+
+def workflow_default(c: Composition) -> None:
+    """Bring up the Antithesis test cluster."""
+    c.up("postgres-metadata", "minio", "redpanda")  # metadata store, blob store, and broker first
+    c.up("materialized")  # the SUT, configured in SERVICES to use the external stores
+    c.up("workload")  # last: its depends_on requires materialized and redpanda to be healthy
+
+
+def workflow_export_compose(c: Composition) -> None:
+    """Export the resolved docker-compose YAML to stdout.
+
+    Usage:
+      bin/mzcompose --find antithesis run export-compose > antithesis/config/docker-compose.yaml
+    """
+    # c.compose is the fully-resolved compose dict (mzbuild: replaced with image:)
+    yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)  # block style; keep key order
diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile
new file mode 100644
index 0000000000000..804cb1b3009ec
--- /dev/null
+++ b/test/antithesis/workload/Dockerfile
@@ -0,0 +1,34 @@
+# Antithesis workload client for Materialize.
+#
+# Python-based test driver that connects to materialized via pgwire,
+# produces Kafka messages, and emits Antithesis assertions.
+
+FROM python:3.12-slim-bookworm
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    postgresql-client \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir \
+    psycopg[binary]==3.2.9 \
+    confluent-kafka==2.8.0 \
+    antithesis==0.2.0
+
+# setup-complete script
+COPY setup-complete.sh /usr/local/bin/setup-complete.sh
+RUN chmod +x /usr/local/bin/setup-complete.sh
+
+# Test template directory — populated by antithesis-workload skill later
+RUN mkdir -p /opt/antithesis/test/v1/materialize
+
+# Catalog directory for Python assertion cataloging
+RUN mkdir -p /opt/antithesis/catalog
+
+# Copy test templates and entrypoint
+COPY test/ /opt/antithesis/test/v1/materialize/
+COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh
+RUN chmod +x /usr/local/bin/workload-entrypoint.sh
+RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true
+
+ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"]
diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml
new file mode 100644
index 0000000000000..beed6bf84e93b
--- /dev/null
+++ b/test/antithesis/workload/mzbuild.yml
@@ -0,0 +1 @@
+name: antithesis-workload
diff --git a/test/antithesis/workload/setup-complete.sh b/test/antithesis/workload/setup-complete.sh
new file mode 100755
index 0000000000000..59384ae9ba2b4
--- /dev/null
+++ b/test/antithesis/workload/setup-complete.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run this script to inform Antithesis that it can start running Test Composer
+# Commands. You can also use the Antithesis SDK to emit setup-complete from your
+# system if that is easier.
+#
+# Antithesis sets the `ANTITHESIS_OUTPUT_DIR` environment variable
+# automatically. This script is set up to append `setup_complete` to the
+# `sdk.jsonl` file in that directory.
+
+OUTPUT_PATH="/tmp/antithesis_sdk.jsonl"  # fallback when neither variable below is set
+if [[ -n "${ANTITHESIS_OUTPUT_DIR:-}" ]]; then
+  OUTPUT_PATH="${ANTITHESIS_OUTPUT_DIR}/sdk.jsonl"
+  echo "Running in Antithesis, emitting setup_complete to ${OUTPUT_PATH}"
+elif [[ -n "${ANTITHESIS_SDK_LOCAL_OUTPUT:-}" ]]; then
+  OUTPUT_PATH="${ANTITHESIS_SDK_LOCAL_OUTPUT}"
+  echo "Antithesis SDK local output override detected, emitting setup_complete to ${OUTPUT_PATH}"
+fi
+
+mkdir -p "$(dirname "$OUTPUT_PATH")"  # quoted so a path containing spaces is not word-split
+echo '{"antithesis_setup":{"status":"complete","details":{"message":"ready to go"}}}' >> "${OUTPUT_PATH}"
diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh
new file mode 100755
index 0000000000000..f3feefe5a402e
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_health_check.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -uo pipefail  # no -e: a failing psql must reach the "failed" branch below, not abort the script
+
+# Basic health check — verifies materialized is responding to SQL.
+# This is a minimal placeholder; the antithesis-workload skill will add
+# real test commands with property assertions.
+
+PGHOST="${PGHOST:-materialized}"
+PGPORT="${PGPORT:-6875}"
+PGUSER="${PGUSER:-materialize}"
+
+result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>&1)
+if [ "$result" = "1" ]; then
+    echo "Health check passed"
+    exit 0
+else
+    echo "Health check failed: $result"
+    exit 1
+fi
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
new file mode 100755
index 0000000000000..0f5b012c3ad9e
--- /dev/null
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Wait for materialized to be ready before signaling setup_complete.
+echo "Waiting for materialized to become healthy..."
+until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do
+    sleep 1
+done
+echo "materialized is healthy."
+
+# Emit setup_complete — Antithesis begins test commands after this.
+/usr/local/bin/setup-complete.sh
+
+# Sleep forever — Test Composer runs the test commands, not this entrypoint.
+echo "Setup complete. Sleeping while Test Composer runs commands."
+exec sleep infinity

From 127f67ec4b598c676c49b2ecd321b35d9f3e42c5 Mon Sep 17 00:00:00 2001
From: Mitch Wagner <mitch.wagner@antithesis.com>
Date: Wed, 6 May 2026 17:47:03 -0400
Subject: [PATCH 02/65] feat: tweaks for basic_test

---
 antithesis/Makefile                           | 16 ++++--
 antithesis/config/Dockerfile                  |  2 +
 antithesis/config/docker-compose.yaml         | 18 ++++---
 test/antithesis/export-compose.py             | 50 +++++++++++++++++--
 .../workload/test/anytime_health_check.sh     |  2 +-
 5 files changed, 73 insertions(+), 15 deletions(-)
 create mode 100644 antithesis/config/Dockerfile

diff --git a/antithesis/Makefile b/antithesis/Makefile
index d29e795d22be7..0afa1cd1f3335 100644
--- a/antithesis/Makefile
+++ b/antithesis/Makefile
@@ -39,17 +39,23 @@ export-compose:
 # ---------------------------------------------------------------------------
 # Build — build images that don't have public equivalents.
 # ---------------------------------------------------------------------------
-LOCAL_IMAGES  := workload
+LOCAL_IMAGES  := workload config
 BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%)
 
 .PHONY: build $(BUILD_TARGETS)
-build: $(BUILD_TARGETS)
+build: export-compose $(BUILD_TARGETS)
 
-$(BUILD_TARGETS): build-%:
+build-workload:
 	$(RUNTIME) build \
 	  --platform linux/amd64 \
-	  -t $(PROJECT)-$*:latest \
-	  $(REPO_ROOT)/test/antithesis/$*
+	  -t $(PROJECT)-workload:latest \
+	  $(REPO_ROOT)/test/antithesis/workload
+
+build-config: export-compose
+	$(RUNTIME) build \
+	  --platform linux/amd64 \
+	  -t $(PROJECT)-config:latest \
+	  config
 
 # ---------------------------------------------------------------------------
 # Up / Down
diff --git a/antithesis/config/Dockerfile b/antithesis/config/Dockerfile
new file mode 100644
index 0000000000000..fb59d4a2bd588
--- /dev/null
+++ b/antithesis/config/Dockerfile
@@ -0,0 +1,2 @@
+FROM scratch
+COPY docker-compose.yaml /
diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml
index 6eb68d6f7e789..b85c1e4d72299 100644
--- a/antithesis/config/docker-compose.yaml
+++ b/antithesis/config/docker-compose.yaml
@@ -15,7 +15,6 @@ services:
     environment:
     - POSTGRESDB=postgres
     - POSTGRES_PASSWORD=postgres
-    - LD_PRELOAD=libeatmydata.so
     - PGPORT=26257
     - POSTGRES_HOST_AUTH_METHOD=trust
     healthcheck:
@@ -27,10 +26,20 @@ services:
       interval: 1s
       start_period: 30s
     restart: 'no'
-    volumes:
-    - ../../misc/postgres/setup_materialize.sql:/docker-entrypoint-initdb.d/z_setup_materialize.sql
     platform: linux/amd64
     image: postgres:17.7
+    entrypoint:
+    - sh
+    - -c
+    - 'printf ''%s\n'' "CREATE ROLE root WITH LOGIN PASSWORD ''root'';" "CREATE DATABASE root;"
+      "GRANT ALL PRIVILEGES ON DATABASE root TO root;" "\c root" "CREATE SCHEMA IF NOT
+      EXISTS consensus AUTHORIZATION root;" "CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION
+      root;" "CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;" "CREATE SCHEMA IF
+      NOT EXISTS tsoracle AUTHORIZATION root;" "GRANT ALL PRIVILEGES ON SCHEMA public
+      TO root;" > /docker-entrypoint-initdb.d/z_setup_materialize.sql
+
+      exec docker-entrypoint.sh "$$@"'
+    - --
   minio:
     entrypoint:
     - sh
@@ -129,7 +138,6 @@ services:
     - MZ_INTERNAL_PERSIST_PUBSUB_LISTEN_ADDR=0.0.0.0:6879
     - MZ_PERSIST_PUBSUB_URL=http://127.0.0.1:6879
     - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection
-    - MZ_EXTERNAL_LOGIN_PASSWORD_MZ_SYSTEM=password
     - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345
     - MZ_CATALOG_STORE=persist
     - MZ_LOG_FILTER
@@ -269,9 +277,7 @@ services:
     - MZ_NO_BUILTIN_POSTGRES=1
     - MZ_NO_BUILTIN_COCKROACH=1
     - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter
-    - MZ_LISTENERS_CONFIG_PATH=/listeners_config
     volumes:
-    - /home/mitch/src/customer/customer-materialize/materialize/src/materialized/ci/listener_configs/testdrive.json:/listeners_config
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
     - tmp:/share/tmp
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 61334a63cc461..5b487a5485bc2 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -46,10 +46,54 @@
         else:
             svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name]
 
-    # Vanilla postgres needs trust auth to match the mzbuild image behavior
-    # (materialized connects as root with no password)
+    # Fixups for vanilla postgres (the mzbuild image has eatmydata, custom
+    # pg_hba.conf, and baked-in init SQL — none of which exist in the public image).
     if svc.get("image", "").startswith("postgres:"):
-        svc.setdefault("environment", []).append("POSTGRES_HOST_AUTH_METHOD=trust")
+        env = svc.setdefault("environment", [])
+        # Remove eatmydata — not installed in vanilla postgres
+        env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")]
+        # Trust auth so materialized can connect as root without a password
+        env.append("POSTGRES_HOST_AUTH_METHOD=trust")
+        # Remove host bind-mount for setup SQL — won't exist in Antithesis.
+        # Instead, inline the init SQL that creates the schemas materialized needs.
+        vols = svc.get("volumes", [])
+        vols[:] = [v for v in vols if "setup_materialize.sql" not in v]
+        if not vols:
+            svc.pop("volumes", None)
+        # One shell word per statement: psql's `\c` consumes the rest of its line.
+        init_sql = " ".join(f'"{stmt}"' for stmt in (
+            "CREATE ROLE root WITH LOGIN PASSWORD 'root';",
+            "CREATE DATABASE root;",
+            "GRANT ALL PRIVILEGES ON DATABASE root TO root;",
+            r"\c root",
+            "CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root;",
+            "CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root;",
+            "CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;",
+            "CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root;",
+            "GRANT ALL PRIVILEGES ON SCHEMA public TO root;",
+        ))
+        # printf, not echo: dash's echo treats `\c` as "suppress further output".
+        svc["entrypoint"] = ["sh", "-c", f"""
+printf '%s\\n' {init_sql} > /docker-entrypoint-initdb.d/z_setup_materialize.sql
+exec docker-entrypoint.sh "$$@"
+""".strip(), "--"]
+
+    # Strip host bind-mounts — they won't resolve in Antithesis
+    if "volumes" in svc:
+        svc["volumes"] = [
+            v for v in svc["volumes"]
+            if not isinstance(v, str) or ":" not in v or not v.split(":")[0].startswith("/")
+        ]
+        if not svc["volumes"]:
+            del svc["volumes"]
+
+    # Remove env vars that point at host-only paths (the Docker image
+    # entrypoint provides sensible defaults when these are unset)
+    if "environment" in svc:
+        svc["environment"] = [
+            e for e in svc["environment"]
+            if not e.startswith(("MZ_LISTENERS_CONFIG_PATH=", "MZ_EXTERNAL_LOGIN_PASSWORD_"))
+        ]
 
     # Drop mzcompose-only keys that docker/podman compose doesn't understand
     for key in ["propagate_uid_gid", "allow_host_ports", "publish"]:
diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh
index f3feefe5a402e..f7d743ebc4cd7 100755
--- a/test/antithesis/workload/test/anytime_health_check.sh
+++ b/test/antithesis/workload/test/anytime_health_check.sh
@@ -9,7 +9,7 @@ PGHOST="${PGHOST:-materialized}"
 PGPORT="${PGPORT:-6875}"
 PGUSER="${PGUSER:-materialize}"
 
-result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>&1)
+result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>/dev/null)
 if [ "$result" = "1" ]; then
     echo "Health check passed"
     exit 0

From 8323b8a1e260109f285e81ff583137f43436d793 Mon Sep 17 00:00:00 2001
From: Mitch Wagner <mitch.wagner@antithesis.com>
Date: Thu, 7 May 2026 15:04:42 -0400
Subject: [PATCH 03/65] feat: working instrumentation

---
 antithesis/Makefile                    | 22 ++++++++++--
 antithesis/config/docker-compose.yaml  |  2 +-
 bin/ci-builder                         | 47 +++++++++++++++++++-------
 ci/builder/Dockerfile                  |  5 +++
 misc/python/materialize/mzbuild.py     | 46 +++++++++++++++++--------
 misc/python/materialize/rustc_flags.py | 14 ++++++++
 src/materialized/ci/Dockerfile         | 11 ++++++
 test/antithesis/export-compose.py      |  2 +-
 8 files changed, 117 insertions(+), 32 deletions(-)

diff --git a/antithesis/Makefile b/antithesis/Makefile
index 0afa1cd1f3335..25bf6408cf927 100644
--- a/antithesis/Makefile
+++ b/antithesis/Makefile
@@ -21,6 +21,9 @@ endif
 ifeq ($(RUNTIME),none)
   $(error neither podman nor docker found in PATH; set RUNTIME=docker or install podman)
 endif
+ifeq ($(RUNTIME),podman)
+  export MZ_DEV_CI_BUILDER_RUNTIME := podman
+endif
 
 COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f config/docker-compose.yaml
 PSQL    := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize
@@ -39,12 +42,26 @@ export-compose:
 # ---------------------------------------------------------------------------
 # Build — build images that don't have public equivalents.
 # ---------------------------------------------------------------------------
-LOCAL_IMAGES  := workload config
+LOCAL_IMAGES  := materialized workload config
 BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%)
 
-.PHONY: build $(BUILD_TARGETS)
+.PHONY: build $(BUILD_TARGETS) build-builder
 build: export-compose $(BUILD_TARGETS)
 
+build-builder:
+	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder build stable --load
+	@tag=$$(cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder tag stable); \
+	  echo "Tagging materialize/ci-builder:$$tag -> $(PROJECT)-builder:latest"; \
+	  $(RUNTIME) tag "materialize/ci-builder:$$tag" $(PROJECT)-builder:latest
+
+build-materialized: build-builder
+	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/mzimage acquire materialized --antithesis
+	@# Tag the mzbuild output to the name the compose file expects
+	@img=$$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \
+	  | grep 'materialized:mzbuild-' | head -1); \
+	  echo "Tagging $$img -> $(PROJECT)-materialized:latest"; \
+	  $(RUNTIME) tag "$$img" $(PROJECT)-materialized:latest
+
 build-workload:
 	$(RUNTIME) build \
 	  --platform linux/amd64 \
@@ -85,6 +102,7 @@ test:
 push:
 	@$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \
 	  | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \
+	  | grep -v '$(PROJECT)-builder' \
 	  | while read item; do \
 	      nametag="$${item#localhost/}"; \
 	      name="$${nametag%:*}"; \
diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml
index b85c1e4d72299..004fc60b245d9 100644
--- a/antithesis/config/docker-compose.yaml
+++ b/antithesis/config/docker-compose.yaml
@@ -294,7 +294,7 @@ services:
       start_period: 600s
     stop_grace_period: 120s
     platform: linux/amd64
-    image: materialize/materialized:latest
+    image: materialize-materialized:latest
   workload:
     depends_on:
       materialized:
diff --git a/bin/ci-builder b/bin/ci-builder
index 066bf273130a9..0e81c806063d8 100755
--- a/bin/ci-builder
+++ b/bin/ci-builder
@@ -18,6 +18,9 @@ set -euo pipefail
 
 NIGHTLY_RUST_DATE=2026-05-06
 
+# Allow overriding the container runtime (e.g. MZ_DEV_CI_BUILDER_RUNTIME=podman).
+DOCKER="${MZ_DEV_CI_BUILDER_RUNTIME:-docker}"
+
 workdir=$(pwd)
 cd "$(dirname "$0")/.."
 
@@ -128,10 +131,14 @@ gid=$(id -g)
 [[ "$gid" -lt 500 ]] && gid=$uid
 
 build() {
+    local cache_args=()
+    if [[ "$DOCKER" != "podman" ]]; then
+        cache_args+=(--cache-from=materialize/ci-builder:"$cache_tag")
+        cache_args+=(--cache-to=type=inline,mode=max)
+    fi
     # shellcheck disable=SC2086 # intentional splitting of build args string
-    docker buildx build --pull \
-        --cache-from=materialize/ci-builder:"$cache_tag" \
-        --cache-to=type=inline,mode=max \
+    "$DOCKER" buildx build --pull \
+        ${cache_args[@]+"${cache_args[@]}"} \
         $docker_build_args \
         --tag materialize/ci-builder:"$tag" \
         --tag ghcr.io/materializeinc/materialize/ci-builder:"$tag" \
@@ -181,13 +188,13 @@ case "$cmd" in
         build "$@"
         ;;
     exists)
-        docker manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null
+        "$DOCKER" manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null
         ;;
     tag)
         echo "$tag"
         ;;
     push)
-        docker login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN"
+        "$DOCKER" login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN"
         build --push "$@"
         ;;
     run)
@@ -372,20 +379,26 @@ case "$cmd" in
             )
         fi
         if [[ "$(uname -s)" = Linux ]]; then
-            args+=(
-                --user "$(id -u):$(stat -c %g /var/run/docker.sock)"
-            )
+            if [[ "$DOCKER" == "podman" ]]; then
+                args+=(--userns=keep-id)
+            else
+                args+=(
+                    --user "$(id -u):$(stat -c %g /var/run/docker.sock)"
+                )
+            fi
 
             if [[ $secrets == "true" ]]; then
                 # Allow Docker-in-Docker by mounting the Docker socket in the
                 # container. Host networking allows us to see ports created by
                 # containers that we launch.
                 args+=(
-                    --volume "/var/run/docker.sock:/var/run/docker.sock"
                     --network host
                     --env "DOCKER_TLS_VERIFY=${DOCKER_TLS_VERIFY-}"
                     --env "DOCKER_HOST=${DOCKER_HOST-}"
                 )
+                if [[ -S /var/run/docker.sock ]]; then
+                    args+=(--volume "/var/run/docker.sock:/var/run/docker.sock")
+                fi
 
                 # Forward Docker configuration too, if available.
                 docker_dir=${DOCKER_CONFIG:-$HOME/.docker}
@@ -431,14 +444,22 @@ case "$cmd" in
         image="$image_registry/ci-builder:$tag"
         # Try downloading the image a few times in case of registry flakiness
         if [[ "${CI:-}" ]]; then
-            if ! docker inspect "$image" > /dev/null 2>&1; then
-                docker pull "$image" || (sleep 3 && docker pull "$image") || (sleep 3 && docker pull "$image") || sleep 3
+            if ! "$DOCKER" inspect "$image" > /dev/null 2>&1; then
+                "$DOCKER" pull "$image" || (sleep 3 && "$DOCKER" pull "$image") || (sleep 3 && "$DOCKER" pull "$image") || sleep 3
             fi
         fi
-        docker run "${args[@]}" "$image" eatmydata "${docker_command[@]}"
+        if [[ "$DOCKER" == "podman" ]]; then
+            # --userns=keep-id already maps the host UID/GID into the
+            # container, so autouseradd is unnecessary.  Override the
+            # entrypoint to skip it.
+            args+=(--entrypoint eatmydata)
+            "$DOCKER" run "${args[@]}" "$image" "${docker_command[@]}"
+        else
+            "$DOCKER" run "${args[@]}" "$image" eatmydata "${docker_command[@]}"
+        fi
         ;;
     root-shell)
-        docker exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh
+        "$DOCKER" exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh
         ;;
     *)
         printf "unknown command %q\n" "$cmd"
diff --git a/ci/builder/Dockerfile b/ci/builder/Dockerfile
index be1da20d8591f..eb6b71be277a4 100644
--- a/ci/builder/Dockerfile
+++ b/ci/builder/Dockerfile
@@ -399,6 +399,11 @@ ENV CARGO_HOME=/cargo
 RUN mkdir /cargo && chmod 777 /cargo
 VOLUME /cargo
 
+# Antithesis coverage instrumentation library (used when --antithesis is passed)
+RUN curl -fsSL https://antithesis.com/assets/instrumentation/libvoidstar.so \
+      -o /usr/lib/libvoidstar.so \
+    && ldconfig
+
 # Stage 3: Build a lightweight CI Builder image for console/playwright jobs.
 FROM ubuntu:noble-20260324 AS ci-builder-console
 
diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index f653b84abc4a9..2200188139219 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -187,6 +187,7 @@ def __init__(
         sanitizer: Sanitizer,
         image_registry: str,
         image_prefix: str,
+        antithesis: bool = False,
     ):
         self.root = root
         self.arch = arch
@@ -196,6 +197,7 @@ def __init__(
         self.cargo_workspace = cargo.Workspace(root)
         self.image_registry = image_registry
         self.image_prefix = image_prefix
+        self.antithesis = antithesis
 
     def build(
         self,
@@ -513,6 +515,8 @@ def extra(self) -> str:
             flags += "optimized"
         if self.rd.coverage:
             flags += "coverage"
+        if self.rd.antithesis:
+            flags += ["antithesis"]
         if self.rd.sanitizer != Sanitizer.none:
             flags += self.rd.sanitizer.value
         flags.sort()
@@ -547,15 +551,14 @@ def generate_cargo_build_command(
         examples: list[str],
         features: list[str] | None = None,
     ) -> list[str]:
-        rustflags = (
-            rustc_flags.coverage
-            if rd.coverage
-            else (
-                rustc_flags.sanitizer[rd.sanitizer]
-                if rd.sanitizer != Sanitizer.none
-                else ["--cfg=tokio_unstable"]
-            )
-        )
+        if rd.antithesis:
+            rustflags = rustc_flags.antithesis
+        elif rd.coverage:
+            rustflags = rustc_flags.coverage
+        elif rd.sanitizer != Sanitizer.none:
+            rustflags = rustc_flags.sanitizer[rd.sanitizer]
+        else:
+            rustflags = ["--cfg=tokio_unstable"]
         cflags = (
             [
                 f"--target={target(rd.arch)}",
@@ -568,8 +571,8 @@ def generate_cargo_build_command(
             if rd.sanitizer != Sanitizer.none
             else []
         )
-        extra_env = (
-            {
+        if rd.sanitizer != Sanitizer.none:
+            extra_env = {
                 "CFLAGS": " ".join(cflags),
                 "CXXFLAGS": " ".join(cflags),
                 "LDFLAGS": " ".join(cflags),
@@ -582,9 +585,8 @@ def generate_cargo_build_command(
                 "PATH": f"/sanshim:/opt/x-tools/{target(rd.arch)}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
                 "TSAN_OPTIONS": "report_bugs=0",  # build-scripts fail
             }
-            if rd.sanitizer != Sanitizer.none
-            else {}
-        )
+        else:
+            extra_env = {}
 
         cargo_build = rd.build(
             "build", channel=None, rustflags=rustflags, extra_env=extra_env
@@ -672,7 +674,11 @@ def copy(src: Path, relative_dst: Path) -> None:
             exe_path.parent.mkdir(parents=True, exist_ok=True)
             shutil.copy(src, exe_path)
 
-            if self.strip:
+            if self.rd.antithesis:
+                # Antithesis needs full debug symbols for symbolization.
+                # Don't strip anything.
+                pass
+            elif self.strip:
                 # The debug information is large enough that it slows down CI,
                 # since we're packaging these binaries up into Docker images and
                 # shipping them around.
@@ -945,6 +951,7 @@ def _build_locked(
             "ARCH_GCC": str(self.image.rd.arch),
             "ARCH_GO": self.image.rd.arch.go_str(),
             "CI_SANITIZER": str(self.image.rd.sanitizer),
+            "ANTITHESIS": "1" if self.image.rd.antithesis else "",
         }
         f = self.write_dockerfile()
 
@@ -1416,6 +1423,7 @@ def __init__(
         sanitizer: Sanitizer = Sanitizer.none,
         image_registry: str = image_registry(),
         image_prefix: str = "",
+        antithesis: bool = False,
     ):
         self.rd = RepositoryDetails(
             root,
@@ -1425,6 +1433,7 @@ def __init__(
             sanitizer,
             image_registry,
             image_prefix,
+            antithesis=antithesis,
         )
         self.images: dict[str, Image] = {}
         self.compositions: dict[str, Path] = {}
@@ -1517,6 +1526,12 @@ def install_arguments(parser: argparse.ArgumentParser) -> None:
             default="",
             help="a prefix to apply to all Docker image names",
         )
+        parser.add_argument(
+            "--antithesis",
+            help="whether to enable Antithesis coverage instrumentation",
+            default=ui.env_is_truthy("CI_ANTITHESIS"),
+            action="store_true",
+        )
 
     @classmethod
     def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository":
@@ -1544,6 +1559,7 @@ def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository":
             image_registry=args.image_registry,
             image_prefix=args.image_prefix,
             arch=args.arch,
+            antithesis=args.antithesis,
         )
 
     @property
diff --git a/misc/python/materialize/rustc_flags.py b/misc/python/materialize/rustc_flags.py
index 6353f83d3b68a..f6aac45573e14 100644
--- a/misc/python/materialize/rustc_flags.py
+++ b/misc/python/materialize/rustc_flags.py
@@ -25,6 +25,20 @@
 ]
 
 
+# Flags to enable Antithesis coverage instrumentation.
+# Requires libvoidstar.so at /usr/lib/ (installed in ci-builder and
+# the materialized Docker image).
+# See: https://antithesis.com/docs/using_antithesis/sdk/rust/instrumentation/
+antithesis = [
+    "-Ccodegen-units=1",
+    "-Cpasses=sancov-module",
+    "-Cllvm-args=-sanitizer-coverage-level=3",
+    "-Cllvm-args=-sanitizer-coverage-trace-pc-guard",
+    "-Clink-args=-Wl,--build-id",
+    "-lvoidstar",
+]
+
+
 class Sanitizer(Enum):
     """What sanitizer to use"""
 
diff --git a/src/materialized/ci/Dockerfile b/src/materialized/ci/Dockerfile
index 18686251a7b07..e06aaf6bad0cf 100644
--- a/src/materialized/ci/Dockerfile
+++ b/src/materialized/ci/Dockerfile
@@ -20,6 +20,17 @@ COPY materialized entrypoint.sh /usr/local/bin/
 USER root
 RUN ln -s /usr/local/bin/materialized /usr/local/bin/environmentd \
   && ln -s /usr/local/bin/materialized /usr/local/bin/clusterd
+
+# Antithesis instrumentation (conditional on --build-arg ANTITHESIS=1)
+ARG ANTITHESIS
+RUN if [ -n "$ANTITHESIS" ]; then \
+      curl -fsSL https://antithesis.com/assets/instrumentation/libvoidstar.so \
+        -o /usr/lib/libvoidstar.so \
+      && ldconfig \
+      && mkdir -p /symbols \
+      && ln -s /usr/local/bin/materialized /symbols/materialized; \
+    fi
+
 USER materialize
 
 ENTRYPOINT ["tini", "--", "entrypoint.sh"]
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 5b487a5485bc2..6d7e463d564a2 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -22,7 +22,7 @@
 # Map mzbuild names → image references for the Antithesis compose.
 # Public images for infra; local build tag for the workload.
 MZBUILD_TO_IMAGE = {
-    "materialized": "materialize/materialized:latest",
+    "materialized": "materialize-materialized:latest",
     "postgres": "postgres:17.7",
     "minio": "minio/minio:latest",
     "antithesis-workload": "materialize-workload:latest",

From 2d9ae67499ff0f1e8842467803e71dcd9d513812 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 13:48:21 -0400
Subject: [PATCH 04/65] test/antithesis: consolidate antithesis/ into
 test/antithesis/

---
 antithesis/AGENTS.md                          | 15 -------------
 test/antithesis/AGENTS.md                     | 21 +++++++++++++++++++
 {antithesis => test/antithesis}/Makefile      |  6 +++---
 .../antithesis}/config/Dockerfile             |  0
 .../antithesis}/config/docker-compose.yaml    |  0
 .../antithesis}/scratchbook/bug-candidates.md |  0
 .../scratchbook/deployment-topology.md        |  0
 .../scratchbook/existing-assertions.md        |  0
 .../catalog-recovery-consistency.md           |  0
 .../properties/command-channel-ordering.md    |  0
 .../compute-replica-epoch-isolation.md        |  0
 .../critical-reader-fence-linearization.md    |  0
 .../properties/deployment-lag-detection.md    |  0
 .../properties/deployment-promotion-safety.md |  0
 .../epoch-fencing-prevents-split-brain.md     |  0
 .../properties/fault-recovery-exercised.md    |  0
 .../properties/group-commit-toctou-safety.md  |  0
 .../idempotent-write-under-indeterminate.md   |  0
 .../properties/mv-reflects-source-updates.md  |  0
 .../properties/peek-lifecycle-exactly-once.md |  0
 .../properties/persist-cas-monotonicity.md    |  0
 .../properties/source-ingestion-progress.md   |  0
 .../storage-command-replay-idempotent.md      |  0
 .../properties/strict-serializable-reads.md   |  0
 .../properties/tombstone-sealing-finality.md  |  0
 .../scratchbook/property-catalog.md           |  0
 .../scratchbook/property-relationships.md     |  0
 .../antithesis}/scratchbook/sut-analysis.md   |  0
 28 files changed, 24 insertions(+), 18 deletions(-)
 delete mode 100644 antithesis/AGENTS.md
 create mode 100644 test/antithesis/AGENTS.md
 rename {antithesis => test/antithesis}/Makefile (96%)
 rename {antithesis => test/antithesis}/config/Dockerfile (100%)
 rename {antithesis => test/antithesis}/config/docker-compose.yaml (100%)
 rename {antithesis => test/antithesis}/scratchbook/bug-candidates.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/deployment-topology.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/existing-assertions.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/catalog-recovery-consistency.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/command-channel-ordering.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/compute-replica-epoch-isolation.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/critical-reader-fence-linearization.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/deployment-lag-detection.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/deployment-promotion-safety.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/epoch-fencing-prevents-split-brain.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/fault-recovery-exercised.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/group-commit-toctou-safety.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/idempotent-write-under-indeterminate.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/mv-reflects-source-updates.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/peek-lifecycle-exactly-once.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/persist-cas-monotonicity.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/source-ingestion-progress.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/storage-command-replay-idempotent.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/strict-serializable-reads.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/properties/tombstone-sealing-finality.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/property-catalog.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/property-relationships.md (100%)
 rename {antithesis => test/antithesis}/scratchbook/sut-analysis.md (100%)

diff --git a/antithesis/AGENTS.md b/antithesis/AGENTS.md
deleted file mode 100644
index ff80e8994fb67..0000000000000
--- a/antithesis/AGENTS.md
+++ /dev/null
@@ -1,15 +0,0 @@
-This directory contains files relevant to running tests in Antithesis.
-
-Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands.
-
-**setup-complete.sh**
-Inject this script into a Dockerfile to notify Antithesis that setup is complete. This script should only run once the system under test is ready for testing. Antithesis will not run any test commands until it receives this event.
-
-**config**
-This directory contains the `docker-compose.yaml` file used to bring up this system within the Antithesis environment, along with any closely related config files.
-
-**scratchbook**
-This directory is the Antithesis scratchbook for the codebase. It contains documents such as system analysis, property catalogs, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, and other persistent integration notes. Keep it up to date as Antithesis-related decisions change.
-
-**test**
-This directory contains test templates. A test template is a directory containing test command executable files. Each test command must have a valid prefix: `parallel_driver_, singleton_driver_, serial_driver_, first_, eventually_, finally_, anytime_`. Prefixes constrain when and how commands are composed in a single timeline. Files or subdirectories prefixed with `helper_` are ignored by Test Composer and can be used for helper scripts kept alongside the commands.
diff --git a/test/antithesis/AGENTS.md b/test/antithesis/AGENTS.md
new file mode 100644
index 0000000000000..b93956df1ea94
--- /dev/null
+++ b/test/antithesis/AGENTS.md
@@ -0,0 +1,21 @@
+Files relevant to running Materialize under Antithesis.
+
+Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands.
+
+**mzcompose.py**
+Source of truth for the Antithesis topology. Standard mzcompose composition: services (`postgres-metadata`, `minio`, `redpanda`, `materialized`, `workload`), dependencies, env, ports. The generated `config/docker-compose.yaml` is derived from this.
+
+**export-compose.py**
+Renders `mzcompose.py` into a flat docker-compose YAML that Antithesis can consume. Images are emitted as `ghcr.io/materializeinc/materialize/<name>:mzbuild-<fingerprint>` refs that Antithesis pulls directly from public GHCR.
+
+**workload/**
+Mzbuild image (`antithesis-workload`) for the Python test driver. Dockerfile, entrypoint, and test-template scripts (`test/*.sh`) live here. Test command files must be prefixed with one of `parallel_driver_`, `singleton_driver_`, `serial_driver_`, `first_`, `eventually_`, `finally_`, `anytime_`; files prefixed with `helper_` are ignored by Test Composer.
+
+**config/**
+Mzbuild image (`antithesis-config`) — a `FROM scratch` container holding the generated `docker-compose.yaml`. This is the image Antithesis points at to bring up the environment.
+
+**scratchbook/**
+Antithesis scratchbook: system analysis, property catalog, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, persistent integration notes. Keep up to date as Antithesis-related decisions change.
+
+**setup-complete.sh** (in `workload/`)
+Inject this script into a Dockerfile to notify Antithesis that setup is complete. Should only run once the system under test is ready for testing — Antithesis will not run test commands until it receives this event.
diff --git a/antithesis/Makefile b/test/antithesis/Makefile
similarity index 96%
rename from antithesis/Makefile
rename to test/antithesis/Makefile
index 25bf6408cf927..bee47a5d6f2c6 100644
--- a/antithesis/Makefile
+++ b/test/antithesis/Makefile
@@ -13,7 +13,7 @@ SHELL := /usr/bin/env bash
 .SHELLFLAGS := -eu -o pipefail -c
 
 PROJECT   := materialize
-REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/..)
+REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
 
 ifndef RUNTIME
   RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none))
@@ -36,8 +36,8 @@ REGISTRY_PATH ?= /molten-verve-216720/materialize-repository
 # ---------------------------------------------------------------------------
 .PHONY: export-compose
 export-compose:
-	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml
-	@echo "Wrote config/docker-compose.yaml"
+	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml
+	@echo "Wrote test/antithesis/config/docker-compose.yaml"
 
 # ---------------------------------------------------------------------------
 # Build — build images that don't have public equivalents.
diff --git a/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile
similarity index 100%
rename from antithesis/config/Dockerfile
rename to test/antithesis/config/Dockerfile
diff --git a/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
similarity index 100%
rename from antithesis/config/docker-compose.yaml
rename to test/antithesis/config/docker-compose.yaml
diff --git a/antithesis/scratchbook/bug-candidates.md b/test/antithesis/scratchbook/bug-candidates.md
similarity index 100%
rename from antithesis/scratchbook/bug-candidates.md
rename to test/antithesis/scratchbook/bug-candidates.md
diff --git a/antithesis/scratchbook/deployment-topology.md b/test/antithesis/scratchbook/deployment-topology.md
similarity index 100%
rename from antithesis/scratchbook/deployment-topology.md
rename to test/antithesis/scratchbook/deployment-topology.md
diff --git a/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md
similarity index 100%
rename from antithesis/scratchbook/existing-assertions.md
rename to test/antithesis/scratchbook/existing-assertions.md
diff --git a/antithesis/scratchbook/properties/catalog-recovery-consistency.md b/test/antithesis/scratchbook/properties/catalog-recovery-consistency.md
similarity index 100%
rename from antithesis/scratchbook/properties/catalog-recovery-consistency.md
rename to test/antithesis/scratchbook/properties/catalog-recovery-consistency.md
diff --git a/antithesis/scratchbook/properties/command-channel-ordering.md b/test/antithesis/scratchbook/properties/command-channel-ordering.md
similarity index 100%
rename from antithesis/scratchbook/properties/command-channel-ordering.md
rename to test/antithesis/scratchbook/properties/command-channel-ordering.md
diff --git a/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md b/test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
similarity index 100%
rename from antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
rename to test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md
diff --git a/antithesis/scratchbook/properties/critical-reader-fence-linearization.md b/test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md
similarity index 100%
rename from antithesis/scratchbook/properties/critical-reader-fence-linearization.md
rename to test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md
diff --git a/antithesis/scratchbook/properties/deployment-lag-detection.md b/test/antithesis/scratchbook/properties/deployment-lag-detection.md
similarity index 100%
rename from antithesis/scratchbook/properties/deployment-lag-detection.md
rename to test/antithesis/scratchbook/properties/deployment-lag-detection.md
diff --git a/antithesis/scratchbook/properties/deployment-promotion-safety.md b/test/antithesis/scratchbook/properties/deployment-promotion-safety.md
similarity index 100%
rename from antithesis/scratchbook/properties/deployment-promotion-safety.md
rename to test/antithesis/scratchbook/properties/deployment-promotion-safety.md
diff --git a/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md b/test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
similarity index 100%
rename from antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
rename to test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md
diff --git a/antithesis/scratchbook/properties/fault-recovery-exercised.md b/test/antithesis/scratchbook/properties/fault-recovery-exercised.md
similarity index 100%
rename from antithesis/scratchbook/properties/fault-recovery-exercised.md
rename to test/antithesis/scratchbook/properties/fault-recovery-exercised.md
diff --git a/antithesis/scratchbook/properties/group-commit-toctou-safety.md b/test/antithesis/scratchbook/properties/group-commit-toctou-safety.md
similarity index 100%
rename from antithesis/scratchbook/properties/group-commit-toctou-safety.md
rename to test/antithesis/scratchbook/properties/group-commit-toctou-safety.md
diff --git a/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md b/test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
similarity index 100%
rename from antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
rename to test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md
diff --git a/antithesis/scratchbook/properties/mv-reflects-source-updates.md b/test/antithesis/scratchbook/properties/mv-reflects-source-updates.md
similarity index 100%
rename from antithesis/scratchbook/properties/mv-reflects-source-updates.md
rename to test/antithesis/scratchbook/properties/mv-reflects-source-updates.md
diff --git a/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md b/test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
similarity index 100%
rename from antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
rename to test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md
diff --git a/antithesis/scratchbook/properties/persist-cas-monotonicity.md b/test/antithesis/scratchbook/properties/persist-cas-monotonicity.md
similarity index 100%
rename from antithesis/scratchbook/properties/persist-cas-monotonicity.md
rename to test/antithesis/scratchbook/properties/persist-cas-monotonicity.md
diff --git a/antithesis/scratchbook/properties/source-ingestion-progress.md b/test/antithesis/scratchbook/properties/source-ingestion-progress.md
similarity index 100%
rename from antithesis/scratchbook/properties/source-ingestion-progress.md
rename to test/antithesis/scratchbook/properties/source-ingestion-progress.md
diff --git a/antithesis/scratchbook/properties/storage-command-replay-idempotent.md b/test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md
similarity index 100%
rename from antithesis/scratchbook/properties/storage-command-replay-idempotent.md
rename to test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md
diff --git a/antithesis/scratchbook/properties/strict-serializable-reads.md b/test/antithesis/scratchbook/properties/strict-serializable-reads.md
similarity index 100%
rename from antithesis/scratchbook/properties/strict-serializable-reads.md
rename to test/antithesis/scratchbook/properties/strict-serializable-reads.md
diff --git a/antithesis/scratchbook/properties/tombstone-sealing-finality.md b/test/antithesis/scratchbook/properties/tombstone-sealing-finality.md
similarity index 100%
rename from antithesis/scratchbook/properties/tombstone-sealing-finality.md
rename to test/antithesis/scratchbook/properties/tombstone-sealing-finality.md
diff --git a/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
similarity index 100%
rename from antithesis/scratchbook/property-catalog.md
rename to test/antithesis/scratchbook/property-catalog.md
diff --git a/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md
similarity index 100%
rename from antithesis/scratchbook/property-relationships.md
rename to test/antithesis/scratchbook/property-relationships.md
diff --git a/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md
similarity index 100%
rename from antithesis/scratchbook/sut-analysis.md
rename to test/antithesis/scratchbook/sut-analysis.md

From 40100e8ee92c80a586531075c8cdca73437e9c9a Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 13:49:53 -0400
Subject: [PATCH 05/65] test/antithesis: add antithesis-config mzbuild image
 (FROM scratch + compose YAML)

---
 test/antithesis/config/Dockerfile  | 13 +++++++++++++
 test/antithesis/config/mzbuild.yml | 19 +++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 test/antithesis/config/mzbuild.yml

diff --git a/test/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile
index fb59d4a2bd588..386049db7e8e5 100644
--- a/test/antithesis/config/Dockerfile
+++ b/test/antithesis/config/Dockerfile
@@ -1,2 +1,15 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Antithesis config image: a FROM-scratch tarball holding the resolved
+# docker-compose.yaml that Antithesis uses to bring up the system under
+# test. See mzbuild.yml for regeneration instructions.
+
 FROM scratch
 COPY docker-compose.yaml /
diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml
new file mode 100644
index 0000000000000..899d620d1285f
--- /dev/null
+++ b/test/antithesis/config/mzbuild.yml
@@ -0,0 +1,19 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# FROM-scratch image holding the resolved docker-compose.yaml for the
+# Antithesis environment. Antithesis pulls this image and reads the compose
+# spec from `/docker-compose.yaml` to bring up the system under test.
+#
+# The compose file is generated from test/antithesis/mzcompose.py via
+# `bin/pyactivate test/antithesis/export-compose.py`. Re-run that whenever
+# the composition topology changes; CI verifies the committed copy is up to
+# date.
+
+name: antithesis-config

From 92569ac49eb631642d59301b676685cc000ff9c8 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 13:52:00 -0400
Subject: [PATCH 06/65] test/antithesis: add copyright headers

---
 test/antithesis/config/docker-compose.yaml          | 13 +++++++++++++
 test/antithesis/export-compose.py                   | 10 ++++++++++
 test/antithesis/workload/Dockerfile                 |  9 +++++++++
 test/antithesis/workload/mzbuild.yml                |  9 +++++++++
 test/antithesis/workload/setup-complete.sh          | 10 ++++++++++
 .../workload/test/anytime_health_check.sh           | 10 ++++++++++
 test/antithesis/workload/workload-entrypoint.sh     | 10 ++++++++++
 7 files changed, 71 insertions(+)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 004fc60b245d9..f1f359ad7dfef 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -1,3 +1,16 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# GENERATED FILE — do not edit. Regenerate via:
+#   bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml
+# Source of truth: test/antithesis/mzcompose.py.
+
 services:
   postgres-metadata:
     command:
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 6d7e463d564a2..80b20b2d9e9f6 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -1,4 +1,14 @@
 #!/usr/bin/env python3
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 """Export the resolved docker-compose YAML for the Antithesis composition.
 
 Loads the mzcompose composition and dumps the compose dict to stdout as
diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile
index 804cb1b3009ec..b72a6b541d818 100644
--- a/test/antithesis/workload/Dockerfile
+++ b/test/antithesis/workload/Dockerfile
@@ -1,3 +1,12 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 # Antithesis workload client for Materialize.
 #
 # Python-based test driver that connects to materialized via pgwire,
diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml
index beed6bf84e93b..f62b4c073bb00 100644
--- a/test/antithesis/workload/mzbuild.yml
+++ b/test/antithesis/workload/mzbuild.yml
@@ -1 +1,10 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 name: antithesis-workload
diff --git a/test/antithesis/workload/setup-complete.sh b/test/antithesis/workload/setup-complete.sh
index 59384ae9ba2b4..ecae58fa23e44 100755
--- a/test/antithesis/workload/setup-complete.sh
+++ b/test/antithesis/workload/setup-complete.sh
@@ -1,4 +1,14 @@
 #!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 set -euo pipefail
 
 # Run this script to inform Antithesis that it can start running Test Composer
diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh
index f7d743ebc4cd7..641aed971be93 100755
--- a/test/antithesis/workload/test/anytime_health_check.sh
+++ b/test/antithesis/workload/test/anytime_health_check.sh
@@ -1,4 +1,14 @@
 #!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 set -euo pipefail
 
 # Basic health check — verifies materialized is responding to SQL.
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index 0f5b012c3ad9e..f37eb275ab1e7 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -1,4 +1,14 @@
 #!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
 set -euo pipefail
 
 # Wait for materialized to be ready before signaling setup_complete.

From 0a7d801d5710ec6c174f685229e150748e38474a Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 13:55:17 -0400
Subject: [PATCH 07/65] test/antithesis: rewrite export-compose.py to use
 mzbuild specs

---
 test/antithesis/config/docker-compose.yaml |  36 ++-
 test/antithesis/export-compose.py          | 255 ++++++++++++++-------
 test/antithesis/mzcompose.py               |  20 +-
 3 files changed, 203 insertions(+), 108 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index f1f359ad7dfef..dc75d1e5f2ef8 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -44,12 +44,27 @@ services:
     entrypoint:
     - sh
     - -c
-    - 'echo "CREATE ROLE root WITH LOGIN PASSWORD ''root'';CREATE DATABASE root;GRANT
-      ALL PRIVILEGES ON DATABASE root TO root;\c root;CREATE SCHEMA IF NOT EXISTS
-      consensus AUTHORIZATION root;CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION
-      root;CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;CREATE SCHEMA IF
-      NOT EXISTS tsoracle AUTHORIZATION root;GRANT ALL PRIVILEGES ON SCHEMA public
-      TO root;" > /docker-entrypoint-initdb.d/z_setup_materialize.sql
+    - 'cat <<''SQL'' > /docker-entrypoint-initdb.d/z_setup_materialize.sql
+
+      CREATE ROLE root WITH LOGIN PASSWORD ''root'';
+
+      CREATE DATABASE root;
+
+      GRANT ALL PRIVILEGES ON DATABASE root TO root;
+
+      \c root
+
+      CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root;
+
+      CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root;
+
+      CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;
+
+      CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root;
+
+      GRANT ALL PRIVILEGES ON SCHEMA public TO root;
+
+      SQL
 
       exec docker-entrypoint.sh "$$@"'
     - --
@@ -142,7 +157,6 @@ services:
     environment:
     - MZ_NO_TELEMETRY=1
     - MZ_NO_BUILTIN_CONSOLE=1
-    - MZ_EAT_MY_DATA=1
     - MZ_TEST_ONLY_DUMMY_SEGMENT_CLIENT=true
     - MZ_SOFT_ASSERTIONS=1
     - MZ_ORCHESTRATOR_PROCESS_TCP_PROXY_LISTEN_ADDR=0.0.0.0
@@ -153,8 +167,6 @@ services:
     - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection
     - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345
     - MZ_CATALOG_STORE=persist
-    - MZ_LOG_FILTER
-    - CLUSTERD_LOG_FILTER
     - 'MZ_CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit":
       null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
       true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4":
@@ -284,7 +296,7 @@ services:
     - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1
     - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s
     - COCKROACH_LOG_MAX_SYNC_DURATION=120s
-    - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurre
ncy=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluste
r_replication_factor=1
+    - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurre
ncy=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=10
00;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1
     - MZ_NO_EXTERNAL_CLUSTERD=1
     - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle
     - MZ_NO_BUILTIN_POSTGRES=1
@@ -307,7 +319,7 @@ services:
       start_period: 600s
     stop_grace_period: 120s
     platform: linux/amd64
-    image: materialize-materialized:latest
+    image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7
   workload:
     depends_on:
       materialized:
@@ -321,7 +333,7 @@ services:
     - KAFKA_BROKER=kafka:9092
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
     platform: linux/amd64
-    image: materialize-workload:latest
+    image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-4ENC44FVTZ7WPGVUTKUVI5N7CMOJS2O2
 networks: {}
 volumes:
   mzdata: null
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 80b20b2d9e9f6..081ce78ed41db 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -11,102 +11,199 @@
 
 """Export the resolved docker-compose YAML for the Antithesis composition.
 
-Loads the mzcompose composition and dumps the compose dict to stdout as
-YAML — without building any images or requiring a running Docker daemon.
+Loads `test/antithesis/mzcompose.py`, resolves every `mzbuild:` reference,
+and dumps the resulting docker-compose dict to stdout. Antithesis pulls the
+referenced images directly from public GHCR — no separate registry, no
+re-tagging.
 
-mzbuild references are replaced with public images where possible,
-or local tags for images that must be built (e.g. the workload).
+Image-reference policy:
+
+  * Materialize-built images (`materialized`, `antithesis-workload`) are
+    emitted as `ghcr.io/materializeinc/materialize/<name>:mzbuild-<fp>`.
+    `antithesis=True` participates in the fingerprint, so antithesis builds
+    don't collide with regular builds.
+
+  * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with the
+    public upstream image. Our mzbuild variants bake in test-friendly
+    patches (eatmydata, no_fsync) that defeat Antithesis's fault injection;
+    Antithesis runs against vanilla.
+
+The script also strips mzcompose-only keys, host bind-mounts, and host-path
+env vars that don't resolve inside the Antithesis sandbox, and inlines the
+postgres bootstrap SQL into the entrypoint (the bind-mount path won't
+exist).
 
 Usage:
-    bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml
+    bin/pyactivate test/antithesis/export-compose.py \\
+        > test/antithesis/config/docker-compose.yaml
 """
 
 import sys
 from pathlib import Path
+from typing import Any
 
 import yaml
 
+from materialize import MZ_ROOT
 from materialize.mzbuild import Repository
 from materialize.mzcompose.composition import Composition
 
-# Map mzbuild names → image references for the Antithesis compose.
-# Public images for infra; local build tag for the workload.
-MZBUILD_TO_IMAGE = {
-    "materialized": "materialize-materialized:latest",
+# mzbuild image names that we publish to GHCR and want Antithesis to pull
+# under our fingerprint. Everything else falls back to a public image.
+MATERIALIZE_IMAGES = {"materialized", "antithesis-workload"}
+
+# Public-image fallbacks for mzbuild images whose Materialize-specific
+# customizations subvert Antithesis (eatmydata, fsync no-ops, etc.).
+PUBLIC_FALLBACKS = {
     "postgres": "postgres:17.7",
     "minio": "minio/minio:latest",
-    "antithesis-workload": "materialize-workload:latest",
 }
 
-repo = Repository(Path("."), arch="x86_64")
-c = Composition(repo, "antithesis", munge_services=False)
-
-for name, svc in c.compose["services"].items():
-    svc["platform"] = "linux/amd64"
-
-    if "mzbuild" in svc:
-        mzbuild_name = svc.pop("mzbuild")
-        if mzbuild_name not in MZBUILD_TO_IMAGE:
-            print(
-                f"warning: no image mapping for mzbuild {mzbuild_name!r}, "
-                f"using {mzbuild_name}:latest",
-                file=sys.stderr,
-            )
-            svc["image"] = f"{mzbuild_name}:latest"
-        else:
-            svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name]
-
-    # Fixups for vanilla postgres (the mzbuild image has eatmydata, custom
-    # pg_hba.conf, and baked-in init SQL — none of which exist in the public image).
-    if svc.get("image", "").startswith("postgres:"):
-        env = svc.get("environment", [])
-        # Remove eatmydata — not installed in vanilla postgres
-        env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")]
-        # Trust auth so materialized can connect as root without a password
-        env.append("POSTGRES_HOST_AUTH_METHOD=trust")
-        # Remove host bind-mount for setup SQL — won't exist in Antithesis.
-        # Instead, inline the init SQL that creates the schemas materialized needs.
-        vols = svc.get("volumes", [])
-        vols[:] = [v for v in vols if "setup_materialize.sql" not in v]
-        if not vols:
-            del svc["volumes"]
-        # Inline the init SQL as a script volume
-        init_sql = (
-            "CREATE ROLE root WITH LOGIN PASSWORD 'root';"
-            "CREATE DATABASE root;"
-            "GRANT ALL PRIVILEGES ON DATABASE root TO root;"
-            r"\c root;"
-            "CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root;"
-            "CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root;"
-            "CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;"
-            "CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root;"
-            "GRANT ALL PRIVILEGES ON SCHEMA public TO root;"
+# Header prepended to the generated YAML so check-copyright passes and
+# readers know the file isn't hand-edited.
+HEADER = """\
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# GENERATED FILE — do not edit. Regenerate via:
+#   bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml
+# Source of truth: test/antithesis/mzcompose.py.
+
+"""
+
+
+def resolve_mzbuild(svc: dict[str, Any], deps: Any) -> None:
+    """Replace `mzbuild:` with a concrete `image:` ref."""
+    name = svc.pop("mzbuild")
+    if name in MATERIALIZE_IMAGES:
+        svc["image"] = deps[name].spec()
+    elif name in PUBLIC_FALLBACKS:
+        svc["image"] = PUBLIC_FALLBACKS[name]
+    else:
+        raise ValueError(
+            f"mzbuild image {name!r} has no Antithesis policy — add it to "
+            f"MATERIALIZE_IMAGES (use our GHCR build) or PUBLIC_FALLBACKS "
+            f"(swap to a public image) in export-compose.py."
         )
-        svc.setdefault("entrypoint", [])
-        svc["entrypoint"] = ["sh", "-c", f"""
-echo "{init_sql}" > /docker-entrypoint-initdb.d/z_setup_materialize.sql
-exec docker-entrypoint.sh "$$@"
-""".strip(), "--"]
-
-    # Strip host bind-mounts — they won't resolve in Antithesis
-    if "volumes" in svc:
-        svc["volumes"] = [
-            v for v in svc["volumes"]
-            if not isinstance(v, str) or ":" not in v or not v.split(":")[0].startswith("/")
-        ]
-        if not svc["volumes"]:
-            del svc["volumes"]
-
-    # Remove env vars that point at host-only paths (the Docker image
-    # entrypoint provides sensible defaults when these are unset)
-    if "environment" in svc:
-        svc["environment"] = [
-            e for e in svc["environment"]
-            if not e.startswith(("MZ_LISTENERS_CONFIG_PATH=", "MZ_EXTERNAL_LOGIN_PASSWORD_"))
-        ]
-
-    # Drop mzcompose-only keys that docker/podman compose doesn't understand
-    for key in ["propagate_uid_gid", "allow_host_ports", "publish"]:
+
+
+def inline_postgres_setup(svc: dict[str, Any]) -> None:
+    """Replace the bind-mounted setup SQL with an inline entrypoint write.
+
+    Antithesis has no host filesystem, so we can't mount the SQL file.
+    Read it from misc/postgres/setup_materialize.sql (one source of truth)
+    and bake it into the service entrypoint.
+    """
+    if not svc.get("image", "").startswith("postgres:"):
+        return
+
+    env = svc.setdefault("environment", [])
+    # eatmydata isn't installed in the public postgres image.
+    env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")]
+    # Trust auth — Antithesis-internal traffic only.
+    env.append("POSTGRES_HOST_AUTH_METHOD=trust")
+
+    # Drop the bind-mounted setup SQL; we'll inline it.
+    vols = svc.get("volumes", [])
+    vols[:] = [v for v in vols if "setup_materialize.sql" not in v]
+    if not vols:
+        svc.pop("volumes", None)
+
+    setup_sql = (MZ_ROOT / "misc" / "postgres" / "setup_materialize.sql").read_text()
+    # Strip empty lines and lines starting with `--` (SQL comments). The
+    # quoted heredoc (<<'SQL') below writes the rest without shell expansion.
+    setup_sql = "\n".join(
+        line for line in setup_sql.splitlines() if line and not line.startswith("--")
+    )
+    svc["entrypoint"] = [
+        "sh",
+        "-c",
+        # `$$@` survives compose's $-interpolation and arrives as `$@` at the
+        # shell, forwarding any args (e.g., the `postgres` CMD) verbatim.
+        f"cat <<'SQL' > /docker-entrypoint-initdb.d/z_setup_materialize.sql\n"
+        f"{setup_sql}\n"
+        f"SQL\n"
+        f'exec docker-entrypoint.sh "$$@"',
+        "--",
+    ]
+
+
+def strip_host_bindmounts(svc: dict[str, Any]) -> None:
+    """Drop volume entries that bind-mount a host path."""
+    if "volumes" not in svc:
+        return
+    svc["volumes"] = [
+        v
+        for v in svc["volumes"]
+        if not isinstance(v, str)
+        or ":" not in v
+        or not v.split(":", 1)[0].startswith("/")
+    ]
+    if not svc["volumes"]:
+        del svc["volumes"]
+
+
+def strip_incompatible_env(svc: dict[str, Any]) -> None:
+    """Drop env vars that are unsafe or unresolvable under Antithesis.
+
+    - `MZ_EAT_MY_DATA` enables `libeatmydata.so` (fsync no-op) — fatal for
+      crash-recovery testing under fault injection.
+    - `MZ_LISTENERS_CONFIG_PATH` and `MZ_EXTERNAL_LOGIN_PASSWORD_*` reference
+      host paths or host secrets that don't exist in the sandbox.
+    - Bare env vars (no `=`) inherit from the host environment, which is
+      empty under Antithesis; drop them so materialized's built-in defaults
+      apply.
+    """
+    if "environment" not in svc:
+        return
+    drop_prefixes = (
+        "MZ_EAT_MY_DATA=",
+        "MZ_LISTENERS_CONFIG_PATH=",
+        "MZ_EXTERNAL_LOGIN_PASSWORD_",
+    )
+    svc["environment"] = [
+        e for e in svc["environment"] if "=" in e and not e.startswith(drop_prefixes)
+    ]
+
+
+def strip_mzcompose_keys(svc: dict[str, Any]) -> None:
+    """Drop keys understood by mzcompose but not by docker/podman compose."""
+    for key in ("propagate_uid_gid", "allow_host_ports", "publish"):
         svc.pop(key, None)
 
-yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)
+
+def main() -> None:
+    # munge_services=False keeps ports bare (e.g., `6875` instead of
+    # `127.0.0.1::6875`) — Antithesis is container-to-container, no host
+    # binding. We do our own mzbuild→image substitution below.
+    repo = Repository(Path("."), arch="x86_64", antithesis=True)
+    c = Composition(repo, "antithesis", munge_services=False)
+
+    images = [
+        repo.images[svc["mzbuild"]]
+        for svc in c.compose["services"].values()
+        if "mzbuild" in svc
+    ]
+    deps = repo.resolve_dependencies(images)
+
+    for svc in c.compose["services"].values():
+        svc["platform"] = "linux/amd64"
+        if "mzbuild" in svc:
+            resolve_mzbuild(svc, deps)
+        inline_postgres_setup(svc)
+        strip_host_bindmounts(svc)
+        strip_incompatible_env(svc)
+        strip_mzcompose_keys(svc)
+
+    sys.stdout.write(HEADER)
+    yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index d84b0f0108bd5..c5320b38b0f80 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -18,15 +18,11 @@
   - workload: Python test driver with Antithesis SDK
 
 Usage:
-  bin/mzcompose --find antithesis run default        # bring up the cluster
-  bin/mzcompose --find antithesis run export-compose  # dump compose YAML
+  bin/mzcompose --find antithesis run default                       # bring up the cluster
+  bin/pyactivate test/antithesis/export-compose.py > config/...     # dump compose YAML
 """
 
-import sys
-
-import yaml
-
-from materialize.mzcompose.composition import Composition, WorkflowArgumentParser
+from materialize.mzcompose.composition import Composition
 from materialize.mzcompose.service import Service, ServiceConfig
 from materialize.mzcompose.services.materialized import Materialized
 from materialize.mzcompose.services.minio import Minio
@@ -76,13 +72,3 @@ def workflow_default(c: Composition) -> None:
     c.up("postgres-metadata", "minio", "redpanda")
     c.up("materialized")
     c.up("workload")
-
-
-def workflow_export_compose(c: Composition) -> None:
-    """Export the resolved docker-compose YAML to stdout.
-
-    Usage:
-      bin/mzcompose --find antithesis run export-compose > antithesis/config/docker-compose.yaml
-    """
-    # c.compose is the fully-resolved compose dict (mzbuild: replaced with image:)
-    yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)

From d7cc3c466c85a42e82bb78fa97bfd5d4052295fc Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:02:50 -0400
Subject: [PATCH 08/65] test/antithesis: strip Makefile to mzbuild-driven build
 (drop registry/retag hacks)

---
 test/antithesis/Makefile | 138 +++++++++++++++------------------------
 1 file changed, 52 insertions(+), 86 deletions(-)

diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile
index bee47a5d6f2c6..f25077b582d33 100644
--- a/test/antithesis/Makefile
+++ b/test/antithesis/Makefile
@@ -1,20 +1,35 @@
-# Build / run helper for the Materialize Antithesis harness.
+# Copyright Materialize, Inc. and contributors. All rights reserved.
 #
-# Usage:
-#   make build                 # build every local image
-#   make up                    # export compose, build, bring up the stack
-#   make test                  # smoke test against the running cluster
-#   make push                  # push locally-built images to Antithesis registry
-#   make down                  # tear down (preserves volumes)
-#   make clean                 # tear down + remove volumes + images
-#   make smoke                 # full cycle: build → up → test
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Local-dev helper for the Materialize Antithesis harness.
+#
+# Antithesis images ship via the standard mzbuild → GHCR flow: CI publishes
+# them the same way it publishes every other image, fingerprint-tagged with
+# `mzbuild-<fp>`. Locally, we just acquire the mzbuild images, regenerate
+# the compose YAML, and let `docker compose` find them by their canonical
+# spec.
+#
+# Targets:
+#   make build    # regenerate compose YAML, acquire local mzbuild images
+#   make up       # build + bring up the stack
+#   make down     # tear down (preserves volumes)
+#   make smoke    # build + up + smoke test
+#   make test     # smoke test against a running stack
+#   make clean    # tear down + remove volumes
 
 SHELL := /usr/bin/env bash
 .SHELLFLAGS := -eu -o pipefail -c
 
-PROJECT   := materialize
+PROJECT   := materialize-antithesis
 REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
 
+# Pick podman if available, else docker.
 ifndef RUNTIME
   RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none))
 endif
@@ -25,104 +40,55 @@ ifeq ($(RUNTIME),podman)
   export MZ_DEV_CI_BUILDER_RUNTIME := podman
 endif
 
-COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f config/docker-compose.yaml
-PSQL    := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize
+COMPOSE_FILE := $(REPO_ROOT)/test/antithesis/config/docker-compose.yaml
+COMPOSE      := $(RUNTIME) compose -p $(PROJECT) -f $(COMPOSE_FILE)
+PSQL         := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize
 
-REGISTRY      ?= us-central1-docker.pkg.dev
-REGISTRY_PATH ?= /molten-verve-216720/materialize-repository
+# mzbuild images we need built locally. Third-party images (postgres, minio,
+# redpanda) are pulled by `docker compose` from their upstream registries.
+MZBUILD_IMAGES := materialized antithesis-workload
 
 # ---------------------------------------------------------------------------
-# Export — generate the resolved docker-compose YAML for Antithesis.
+# Build
 # ---------------------------------------------------------------------------
-.PHONY: export-compose
+.PHONY: build export-compose acquire-images
+
+build: export-compose acquire-images
+
 export-compose:
-	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml
-	@echo "Wrote test/antithesis/config/docker-compose.yaml"
+	cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py > $(COMPOSE_FILE)
+	@echo "Wrote $(COMPOSE_FILE)"
 
-# ---------------------------------------------------------------------------
-# Build — build images that don't have public equivalents.
-# ---------------------------------------------------------------------------
-LOCAL_IMAGES  := materialized workload config
-BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%)
-
-.PHONY: build $(BUILD_TARGETS) build-builder
-build: export-compose $(BUILD_TARGETS)
-
-build-builder:
-	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder build stable --load
-	@tag=$$(cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder tag stable); \
-	  echo "Tagging materialize/ci-builder:$$tag -> $(PROJECT)-builder:latest"; \
-	  $(RUNTIME) tag "materialize/ci-builder:$$tag" $(PROJECT)-builder:latest
-
-build-materialized: build-builder
-	cd $(REPO_ROOT) && $(REPO_ROOT)/bin/mzimage acquire materialized --antithesis
-	@# Tag the mzbuild output to the name the compose file expects
-	@img=$$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \
-	  | grep 'materialized:mzbuild-' | head -1); \
-	  echo "Tagging $$img -> $(PROJECT)-materialized:latest"; \
-	  $(RUNTIME) tag "$$img" $(PROJECT)-materialized:latest
-
-build-workload:
-	$(RUNTIME) build \
-	  --platform linux/amd64 \
-	  -t $(PROJECT)-workload:latest \
-	  $(REPO_ROOT)/test/antithesis/workload
-
-build-config: export-compose
-	$(RUNTIME) build \
-	  --platform linux/amd64 \
-	  -t $(PROJECT)-config:latest \
-	  config
+acquire-images:
+	@for image in $(MZBUILD_IMAGES); do \
+	  echo "--- Acquiring $$image (--antithesis)"; \
+	  cd $(REPO_ROOT) && bin/mzimage acquire "$$image" --antithesis; \
+	done
 
 # ---------------------------------------------------------------------------
 # Up / Down
 # ---------------------------------------------------------------------------
-.PHONY: up
-up: export-compose build
+.PHONY: up down clean
+
+up: build
 	$(COMPOSE) up -d
 
-.PHONY: down
 down:
 	$(COMPOSE) down
 
+clean: down
+	$(COMPOSE) down -v --remove-orphans 2>/dev/null || true
+
 # ---------------------------------------------------------------------------
-# Test — quick smoke test against the running cluster
+# Test
 # ---------------------------------------------------------------------------
-.PHONY: test
+.PHONY: test smoke
+
 test:
 	$(PSQL) -c "CREATE TABLE IF NOT EXISTS smoke_test (k INT, v TEXT)"
 	$(PSQL) -c "INSERT INTO smoke_test VALUES (1, 'hello'), (2, 'world')"
 	$(PSQL) -c "SELECT * FROM smoke_test ORDER BY k"
 	$(PSQL) -c "DROP TABLE smoke_test"
 
-# ---------------------------------------------------------------------------
-# Push — tag local images and push to the Antithesis registry
-# ---------------------------------------------------------------------------
-.PHONY: push
-push:
-	@$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \
-	  | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \
-	  | grep -v '$(PROJECT)-builder' \
-	  | while read item; do \
-	      nametag="$${item#localhost/}"; \
-	      name="$${nametag%:*}"; \
-	      remote="$(REGISTRY)$(REGISTRY_PATH)/$${name}:latest"; \
-	      echo "Pushing $${item} -> $${remote}"; \
-	      $(RUNTIME) tag "$${item}" "$${remote}" || exit 1; \
-	      $(RUNTIME) push "$${remote}" || exit 1; \
-	  done
-
-# ---------------------------------------------------------------------------
-# Clean
-# ---------------------------------------------------------------------------
-.PHONY: clean
-clean: down
-	$(COMPOSE) down -v --remove-orphans 2>/dev/null || true
-	-$(RUNTIME) rmi $$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' | grep '^$(PROJECT)-' || true) 2>/dev/null
-
-# ---------------------------------------------------------------------------
-# Smoke — full cycle: build → up → test
-# ---------------------------------------------------------------------------
-.PHONY: smoke
 smoke: up test
 	@echo "[smoke] passed"

From 106c9a9feb3541232d6dd2c6e3b41fa879ec3e09 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:03:43 -0400
Subject: [PATCH 09/65] ci: nightly antithesis builds via CI_ANTITHESIS env
 passthrough

---
 ci/mkpipeline.py                           | 32 ++++++++++++++++++++++
 ci/nightly/pipeline.template.yml           | 19 +++++++++++++
 ci/test/build.py                           |  2 ++
 ci/test/lint-main/checks/check-pipeline.sh |  1 +
 4 files changed, 54 insertions(+)

diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py
index 79fcb7bd2a0c9..d6be6018c7532 100644
--- a/ci/mkpipeline.py
+++ b/ci/mkpipeline.py
@@ -121,6 +121,12 @@ def main() -> int:
         type=Sanitizer,
         choices=Sanitizer,
     )
+    parser.add_argument(
+        "--antithesis",
+        action="store_true",
+        default=ui.env_is_truthy("CI_ANTITHESIS"),
+        help="enable Antithesis coverage instrumentation",
+    )
     parser.add_argument(
         "--priority",
         type=int,
@@ -166,6 +172,7 @@ def get_hashes(arch: Arch) -> tuple[str, bool]:
             arch=arch,
             coverage=args.coverage,
             sanitizer=args.sanitizer,
+            antithesis=args.antithesis,
         )
         deps = repo.resolve_dependencies(image for image in repo if image.publish)
         check = deps.check()
@@ -209,6 +216,7 @@ def fetch_hashes() -> None:
                 args.coverage,
                 args.sanitizer,
                 lto,
+                args.antithesis,
             )
             trim_ci_glue_exempt_steps(pipeline)
         else:
@@ -218,9 +226,11 @@ def fetch_hashes() -> None:
                 args.coverage,
                 args.sanitizer,
                 lto,
+                args.antithesis,
             )
     truncate_skip_length(pipeline)
     handle_sanitizer_skip(pipeline, args.sanitizer)
+    handle_antithesis_skip(pipeline, args.antithesis)
     increase_agents_timeouts(pipeline, args.sanitizer, args.coverage)
     prioritize_pipeline(pipeline, args.priority)
     switch_jobs_to_aws(pipeline, args.priority)
@@ -240,6 +250,7 @@ def fetch_hashes() -> None:
         args.coverage,
         args.sanitizer,
         lto,
+        args.antithesis,
     )
     add_nightly_deploy_dependency(pipeline, args.pipeline)
     remove_dependencies_on_prs(pipeline, args.pipeline, hash_check)
@@ -328,6 +339,21 @@ def handle_sanitizer_skip(pipeline: Any, sanitizer: Sanitizer) -> None:
                 step["skip"] = True
 
 
+def handle_antithesis_skip(pipeline: Any, antithesis: bool) -> None:
+    if antithesis:
+        pipeline.setdefault("env", {})["CI_ANTITHESIS"] = "1"
+
+        for step in steps(pipeline):
+            if step.get("antithesis") == "skip":
+                step["skip"] = True
+
+    else:
+
+        for step in steps(pipeline):
+            if step.get("antithesis") == "only":
+                step["skip"] = True
+
+
 def increase_agents_timeouts(
     pipeline: Any, sanitizer: Sanitizer, coverage: bool
 ) -> None:
@@ -711,6 +737,7 @@ def trim_tests_pipeline(
     coverage: bool,
     sanitizer: Sanitizer,
     lto: bool,
+    antithesis: bool = False,
 ) -> None:
     """Trim pipeline steps whose inputs have not changed in this branch.
 
@@ -731,6 +758,7 @@ def trim_tests_pipeline(
         profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED,
         coverage=coverage,
         sanitizer=sanitizer,
+        antithesis=antithesis,
     )
     deps = repo.resolve_dependencies(image for image in repo)
 
@@ -917,6 +945,7 @@ def add_cargo_test_dependency(
     coverage: bool,
     sanitizer: Sanitizer,
     lto: bool,
+    antithesis: bool = False,
 ) -> None:
     """Cargo Test normally doesn't have to wait for the build to complete, but it requires a few images (ubuntu-base, postgres), which are rarely changed. So only add a dependency when those images are not on Dockerhub yet."""
     if pipeline_name not in ("test", "nightly"):
@@ -933,6 +962,7 @@ def add_cargo_test_dependency(
         profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED,
         coverage=coverage,
         sanitizer=sanitizer,
+        antithesis=antithesis,
     )
     composition = Composition(repo, name="cargo-test")
     deps = composition.dependencies
@@ -1090,6 +1120,8 @@ def remove_mz_specific_keys(pipeline: Any) -> None:
             del step["coverage"]
         if "sanitizer" in step:
             del step["sanitizer"]
+        if "antithesis" in step:
+            del step["antithesis"]
         if "ci_glue_exempt" in step:
             del step["ci_glue_exempt"]
         if (
diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml
index d10055451b451..f1cf3e4c7ed69 100644
--- a/ci/nightly/pipeline.template.yml
+++ b/ci/nightly/pipeline.template.yml
@@ -65,6 +65,25 @@ steps:
         branches: "main"
         skip: "currently broken"
 
+      - id: build-x86_64-antithesis
+        label: ":rust: Build x86_64 (Antithesis)"
+        command: bin/ci-builder run stable bin/pyactivate -m ci.test.build
+        inputs:
+          - "*"
+        depends_on: []
+        timeout_in_minutes: 90
+        agents:
+          queue: l-builder-linux-x86_64
+        env:
+          CI_ANTITHESIS: "1"
+        # Antithesis-flavored images get distinct mzbuild fingerprints, so
+        # they coexist with regular GHCR tags. The build is x86_64-only —
+        # Antithesis runs amd64 sandboxes.
+        sanitizer: skip
+        coverage: skip
+        antithesis: skip
+        branches: "main"
+
       - id: build-rust-latest-beta
         label: "Build with Latest Rust Beta"
         command: bin/ci-builder run stable ci/test/rust-beta-build.sh
diff --git a/ci/test/build.py b/ci/test/build.py
index d91e82ffe2734..5b18ce91e9b31 100755
--- a/ci/test/build.py
+++ b/ci/test/build.py
@@ -34,11 +34,13 @@ def main() -> None:
         set_build_status("pending")
         coverage = ui.env_is_truthy("CI_COVERAGE_ENABLED")
         sanitizer = Sanitizer[os.getenv("CI_SANITIZER", "none")]
+        antithesis = ui.env_is_truthy("CI_ANTITHESIS")
 
         repo = mzbuild.Repository(
             Path("."),
             coverage=coverage,
             sanitizer=sanitizer,
+            antithesis=antithesis,
             image_registry="materialize",
         )
 
diff --git a/ci/test/lint-main/checks/check-pipeline.sh b/ci/test/lint-main/checks/check-pipeline.sh
index baed7ae9a717c..95da47ae547c8 100755
--- a/ci/test/lint-main/checks/check-pipeline.sh
+++ b/ci/test/lint-main/checks/check-pipeline.sh
@@ -28,6 +28,7 @@ unset CI_TEST_IDS
 unset CI_TEST_SELECTION
 unset CI_SANITIZER
 unset CI_COVERAGE_ENABLED
+unset CI_ANTITHESIS
 unset CI_WAITING_FOR_BUILD
 
 pids=()

From e0214600b7c9497a71b82508a106e050e1b32082 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:13:50 -0400
Subject: [PATCH 10/65] ci: lint check that test/antithesis compose YAML
 matches mzcompose.py

---
 .../checks/check-antithesis-compose.sh        | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100755 ci/test/lint-main/checks/check-antithesis-compose.sh

diff --git a/ci/test/lint-main/checks/check-antithesis-compose.sh b/ci/test/lint-main/checks/check-antithesis-compose.sh
new file mode 100755
index 0000000000000..add2f2a0dab57
--- /dev/null
+++ b/ci/test/lint-main/checks/check-antithesis-compose.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+#
+# check-antithesis-compose.sh — ensure test/antithesis/config/docker-compose.yaml
+# is in sync with test/antithesis/mzcompose.py.
+#
+# Fingerprint refs (`mzbuild-<fp>`) shift on every materialized code change,
+# so we mask them before diffing — we only want to catch composition
+# (services, ports, env, deps) drift, not transient fingerprint churn.
+
+set -euo pipefail
+
+cd "$(dirname "$0")/../../../.."
+
+. misc/shlib/shlib.bash
+
+check_antithesis_compose() {
+    local committed=test/antithesis/config/docker-compose.yaml
+    local generated rc=0
+    generated=$(mktemp)
+
+    bin/pyactivate test/antithesis/export-compose.py > "$generated"
+
+    # Mask `mzbuild-<FINGERPRINT>` so the diff is structural-only.
+    local mask='s/(mzbuild-)[A-Z0-9]+/\1FINGERPRINT/g'
+    if ! diff -u \
+        <(sed -E "$mask" "$committed") \
+        <(sed -E "$mask" "$generated"); then
+        echo
+        echo "$committed is out of sync with test/antithesis/mzcompose.py."
+        echo "Regenerate with:"
+        echo "  bin/pyactivate test/antithesis/export-compose.py > $committed"
+        rc=1
+    fi
+
+    rm -f "$generated"
+    return $rc
+}
+
+try check_antithesis_compose
+
+try_status_report

From ff5c6d79b14d129df416bed54f7f87139e75ff8d Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:24:08 -0400
Subject: [PATCH 11/65] ci: drop branches:main on build-x86_64-antithesis
 (validating)

---
 ci/nightly/pipeline.template.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml
index f1cf3e4c7ed69..065fbe0488b6f 100644
--- a/ci/nightly/pipeline.template.yml
+++ b/ci/nightly/pipeline.template.yml
@@ -82,7 +82,6 @@ steps:
         sanitizer: skip
         coverage: skip
         antithesis: skip
-        branches: "main"
 
       - id: build-rust-latest-beta
         label: "Build with Latest Rust Beta"

From 0f59e7d82451b64c4c9a4d4b9ddb40666981d602 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:31:08 -0400
Subject: [PATCH 12/65] test/antithesis: switch to Kafka stack + external
 clusterd

---
 test/antithesis/config/docker-compose.yaml    | 147 ++++++++++++++----
 test/antithesis/mzcompose.py                  |  50 ++++--
 .../workload/workload-entrypoint.sh           |  33 +++-
 3 files changed, 188 insertions(+), 42 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index dc75d1e5f2ef8..26819190cd164 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -93,44 +93,123 @@ services:
       start_period: 30s
     platform: linux/amd64
     image: minio/minio:latest
-  redpanda:
-    image: redpandadata/redpanda:v25.2.11
+  zookeeper:
+    image: confluentinc/cp-zookeeper:7.9.4
+    ports:
+    - 2181
+    environment:
+    - ZOOKEEPER_CLIENT_PORT=2181
+    healthcheck:
+      test:
+      - CMD
+      - nc
+      - -z
+      - localhost
+      - '2181'
+      interval: 1s
+      start_period: 120s
+    platform: linux/amd64
+  kafka:
+    image: confluentinc/cp-kafka:7.9.4
+    ports:
+    - '9092'
+    environment:
+    - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
+    - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false
+    - KAFKA_MIN_INSYNC_REPLICAS=1
+    - KAFKA_OFFSETS_TOPIC_NUM_PARTITIONS=1
+    - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
+    - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
+    - KAFKA_MESSAGE_MAX_BYTES=15728640
+    - KAFKA_REPLICA_FETCH_MAX_BYTES=15728640
+    - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=100
+    - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
+    - KAFKA_BROKER_ID=1
+    - KAFKA_AUTO_CREATE_TOPICS_ENABLE=True
+    - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
+    depends_on:
+      zookeeper:
+        condition: service_started
+    healthcheck:
+      test:
+      - CMD
+      - nc
+      - -z
+      - localhost
+      - '9092'
+      interval: 1s
+      start_period: 120s
+    platform: linux/amd64
+  schema-registry:
+    image: confluentinc/cp-schema-registry:7.9.4
     ports:
-    - 9092
     - 8081
-    command:
-    - redpanda
-    - start
-    - --overprovisioned
-    - --smp=1
-    - --memory=1G
-    - --reserve-memory=0M
-    - --node-id=0
-    - --check=false
-    - --set
-    - redpanda.enable_transactions=true
-    - --set
-    - redpanda.enable_idempotence=true
-    - --set
-    - redpanda.auto_create_topics_enabled=True
-    - --set
-    - redpanda.topic_memory_per_partition=4096
-    - --set
-    - --advertise-kafka-addr=kafka:9092
     networks:
       default:
-        aliases:
-        - kafka
-        - schema-registry
+        aliases: []
+    environment:
+    - SCHEMA_REGISTRY_KAFKASTORE_TIMEOUT_MS=10000
+    - SCHEMA_REGISTRY_KAFKASTORE_TOPIC_REPLICATION_FACTOR=1
+    - SCHEMA_REGISTRY_HOST_NAME=schema-registry
+    - SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS=PLAINTEXT://kafka:9092
+    command:
+    - /bin/bash
+    - -c
+    - . /etc/confluent/docker/bash-config && . /etc/confluent/docker/mesos-setup.sh
+      && . /etc/confluent/docker/apply-mesos-overrides && /etc/confluent/docker/configure
+      && exec /etc/confluent/docker/launch
+    depends_on:
+      kafka:
+        condition: service_started
     healthcheck:
       test:
       - CMD
       - curl
-      - -f
-      - localhost:9644/v1/status/ready
+      - -fu
+      - materialize:sekurity
+      - localhost:8081
       interval: 1s
       start_period: 120s
     platform: linux/amd64
+  clusterd1:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    - --scratch-directory=/scratch
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd1
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7
   materialized:
     hostname: materialized
     depends_on:
@@ -296,8 +375,7 @@ services:
     - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1
     - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s
     - COCKROACH_LOG_MAX_SYNC_DURATION=120s
-    - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1
-    - MZ_NO_EXTERNAL_CLUSTERD=1
+    - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1;unsafe_enable_unorchestrated_cluster_replicas=true
     - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle
     - MZ_NO_BUILTIN_POSTGRES=1
     - MZ_NO_BUILTIN_COCKROACH=1
@@ -324,16 +402,23 @@ services:
     depends_on:
       materialized:
         condition: service_healthy
-      redpanda:
+      clusterd1:
+        condition: service_started
+      kafka:
         condition: service_healthy
+      schema-registry:
+        condition: service_started
     environment:
     - PGHOST=materialized
     - PGPORT=6875
     - PGUSER=materialize
+    - PGPORT_INTERNAL=6877
+    - PGUSER_INTERNAL=mz_system
     - KAFKA_BROKER=kafka:9092
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
+    - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
     platform: linux/amd64
-    image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-4ENC44FVTZ7WPGVUTKUVI5N7CMOJS2O2
+    image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-XX2UEHO746TTSXP3JUOIMJTYD2WWEBLY
 networks: {}
 volumes:
   mzdata: null
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index c5320b38b0f80..552dd1d21e824 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -10,12 +10,15 @@
 """
 Antithesis test composition for Materialize.
 
-Defines the minimal topology needed to exercise Materialize under Antithesis:
-  - postgres-metadata: consensus/catalog store
-  - minio: S3-compatible blob storage for persist
-  - redpanda: Kafka-compatible broker for source ingestion
-  - materialized: the SUT (embedded clusterd mode)
-  - workload: Python test driver with Antithesis SDK
+Topology exercised under Antithesis:
+  - postgres-metadata : consensus/catalog/timestamp-oracle store
+  - minio             : S3-compatible blob storage for persist
+  - zookeeper + kafka : Kafka broker for source ingestion
+  - schema-registry   : Avro/Protobuf schemas for kafka sources
+  - clusterd1         : external compute+storage process — fenceable
+                        independently of materialized for fault testing
+  - materialized      : the SUT (environmentd; clusterd is external)
+  - workload          : Python test driver wired to the Antithesis SDK
 
 Usage:
   bin/mzcompose --find antithesis run default                       # bring up the cluster
@@ -24,10 +27,13 @@
 
 from materialize.mzcompose.composition import Composition
 from materialize.mzcompose.service import Service, ServiceConfig
+from materialize.mzcompose.services.clusterd import Clusterd
+from materialize.mzcompose.services.kafka import Kafka
 from materialize.mzcompose.services.materialized import Materialized
 from materialize.mzcompose.services.minio import Minio
 from materialize.mzcompose.services.postgres import PostgresMetadata
-from materialize.mzcompose.services.redpanda import Redpanda
+from materialize.mzcompose.services.schema_registry import SchemaRegistry
+from materialize.mzcompose.services.zookeeper import Zookeeper
 
 
 class Workload(Service):
@@ -38,14 +44,22 @@ def __init__(self) -> None:
             "mzbuild": "antithesis-workload",
             "depends_on": {
                 "materialized": {"condition": "service_healthy"},
-                "redpanda": {"condition": "service_healthy"},
+                "clusterd1": {"condition": "service_started"},
+                "kafka": {"condition": "service_healthy"},
+                "schema-registry": {"condition": "service_started"},
             },
             "environment": [
                 "PGHOST=materialized",
                 "PGPORT=6875",
                 "PGUSER=materialize",
+                # Internal SQL port for system-privileged setup (CREATE CLUSTER).
+                "PGPORT_INTERNAL=6877",
+                "PGUSER_INTERNAL=mz_system",
                 "KAFKA_BROKER=kafka:9092",
                 "SCHEMA_REGISTRY_URL=http://schema-registry:8081",
+                # Name of the unmanaged cluster the workload-entrypoint
+                # provisions against clusterd1 before emitting setup-complete.
+                "MZ_ANTITHESIS_CLUSTER=antithesis_cluster",
             ],
         }
         super().__init__(name="workload", config=config)
@@ -54,7 +68,10 @@ def __init__(self) -> None:
 SERVICES = [
     PostgresMetadata(),
     Minio(setup_materialize=True),
-    Redpanda(auto_create_topics=True),
+    Zookeeper(),
+    Kafka(auto_create_topics=True),
+    SchemaRegistry(),
+    Clusterd(name="clusterd1"),
     Materialized(
         external_blob_store=True,
         external_metadata_store=True,
@@ -62,6 +79,12 @@ def __init__(self) -> None:
         unsafe_mode=True,
         soft_assertions=True,
         sanity_restart=False,
+        support_external_clusterd=True,
+        # Allow creating an unmanaged cluster pointed at clusterd1 — without
+        # this, CREATE CLUSTER ... STORAGECTL ADDRESSES is rejected.
+        additional_system_parameter_defaults={
+            "unsafe_enable_unorchestrated_cluster_replicas": "true",
+        },
     ),
     Workload(),
 ]
@@ -69,6 +92,13 @@ def __init__(self) -> None:
 
 def workflow_default(c: Composition) -> None:
     """Bring up the Antithesis test cluster."""
-    c.up("postgres-metadata", "minio", "redpanda")
+    c.up(
+        "postgres-metadata",
+        "minio",
+        "zookeeper",
+        "kafka",
+        "schema-registry",
+        "clusterd1",
+    )
     c.up("materialized")
     c.up("workload")
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index f37eb275ab1e7..e660a7904bb46 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -11,13 +11,44 @@
 
 set -euo pipefail
 
-# Wait for materialized to be ready before signaling setup_complete.
+PGHOST="${PGHOST:-materialized}"
+PGPORT="${PGPORT:-6875}"
+PGUSER="${PGUSER:-materialize}"
+PGPORT_INTERNAL="${PGPORT_INTERNAL:-6877}"
+PGUSER_INTERNAL="${PGUSER_INTERNAL:-mz_system}"
+CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}"
+
+# Wait for materialized to be ready.
 echo "Waiting for materialized to become healthy..."
 until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do
     sleep 1
 done
 echo "materialized is healthy."
 
+# Provision an unmanaged cluster backed by the external clusterd1 process
+# before setup-complete, so Test Composer assertions can target it from the
+# start. `IF NOT EXISTS` is unsupported on `CREATE CLUSTER ... REPLICAS`, so
+# query mz_clusters first; ON_ERROR_STOP makes a failed statement fatal.
+existing=$(
+    psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \
+        "SELECT 1 FROM mz_clusters WHERE name = '$CLUSTER'"
+)
+if [[ -z "$existing" ]]; then
+    echo "Provisioning cluster '$CLUSTER' against clusterd1..."
+    psql -v ON_ERROR_STOP=1 -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" <<SQL
+CREATE CLUSTER ${CLUSTER} REPLICAS (replica1 (
+    STORAGECTL ADDRESSES ['clusterd1:2100'],
+    STORAGE ADDRESSES ['clusterd1:2103'],
+    COMPUTECTL ADDRESSES ['clusterd1:2101'],
+    COMPUTE ADDRESSES ['clusterd1:2102'],
+    WORKERS 1
+));
+GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER};
+SQL
+else
+    echo "Cluster '$CLUSTER' already exists; skipping provisioning."
+fi
+
 # Emit setup_complete — Antithesis begins test commands after this.
 /usr/local/bin/setup-complete.sh
 

From 359402a326ff4b95d3e6ba6248fea51f1858298e Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:52:58 -0400
Subject: [PATCH 13/65] ci: regenerate antithesis compose YAML before build
 (avoid stale fingerprints)

---
 ci/nightly/pipeline.template.yml |  7 ++++++-
 ci/test/build-antithesis.sh      | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100755 ci/test/build-antithesis.sh

diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml
index 065fbe0488b6f..b3c3068e04970 100644
--- a/ci/nightly/pipeline.template.yml
+++ b/ci/nightly/pipeline.template.yml
@@ -67,7 +67,12 @@ steps:
 
       - id: build-x86_64-antithesis
         label: ":rust: Build x86_64 (Antithesis)"
-        command: bin/ci-builder run stable bin/pyactivate -m ci.test.build
+        # Regenerate the antithesis compose YAML before building so the
+        # `antithesis-config` image's fingerprint captures the same
+        # materialized fingerprint we're about to publish — otherwise
+        # Antithesis would try to pull a stale `materialized:mzbuild-…`
+        # whenever the committed YAML lagged behind source changes.
+        command: bin/ci-builder run stable ci/test/build-antithesis.sh
         inputs:
           - "*"
         depends_on: []
diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh
new file mode 100755
index 0000000000000..0eb0788b89cc1
--- /dev/null
+++ b/ci/test/build-antithesis.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+#
+# build-antithesis.sh — antithesis-flavored build entry point.
+#
+# Regenerates test/antithesis/config/docker-compose.yaml against the
+# current source tree before invoking ci.test.build, so that the
+# `antithesis-config` mzbuild image bakes in a compose YAML whose
+# materialized/antithesis-workload fingerprints match the fingerprints
+# this build is about to publish to GHCR.
+#
+# The committed YAML in test/antithesis/config/docker-compose.yaml is for
+# human review (PR diffs); its fingerprints can drift on every materialized
+# source change, and the staleness lint masks them by design. This script
+# is what guarantees Antithesis sees a self-consistent compose.
+
+set -euo pipefail
+
+: "${CI_ANTITHESIS:?build-antithesis.sh expects CI_ANTITHESIS=1}"
+
+echo "--- Regenerating test/antithesis/config/docker-compose.yaml"
+bin/pyactivate test/antithesis/export-compose.py \
+    > test/antithesis/config/docker-compose.yaml
+
+exec bin/pyactivate -m ci.test.build

From 2cfa6a3054eb1af5bf40b6f0fc921ae3ba66be8f Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 14:56:16 -0400
Subject: [PATCH 14/65] test/antithesis: parameterize compose via .env (no more
 baked-in fingerprints)

---
 .gitignore                                    |  2 +
 ci/test/build-antithesis.sh                   | 50 ++++++++----
 .../checks/check-antithesis-compose.sh        | 13 ++-
 test/antithesis/Makefile                      | 13 ++-
 test/antithesis/config/Dockerfile             |  9 ++-
 test/antithesis/config/docker-compose.yaml    |  6 +-
 test/antithesis/config/mzbuild.yml            | 21 +++--
 test/antithesis/export-compose.py             | 59 +++++++-------
 test/antithesis/export-env.py                 | 81 +++++++++++++++++++
 test/antithesis/push-antithesis.py            | 79 ++++++++++++++++++
 10 files changed, 266 insertions(+), 67 deletions(-)
 create mode 100644 test/antithesis/export-env.py
 create mode 100755 test/antithesis/push-antithesis.py

diff --git a/.gitignore b/.gitignore
index 6eb7e16708d6f..58321fab14d4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,8 @@
 mzdata
 mzbuild
 __pycache__
+# Antithesis compose env file — generated by build-antithesis.sh / make build.
+/test/antithesis/config/.env
 .mypy_cache
 venv
 node_modules
diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh
index 0eb0788b89cc1..ef9d24d7c420c 100755
--- a/ci/test/build-antithesis.sh
+++ b/ci/test/build-antithesis.sh
@@ -9,25 +9,45 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 #
-# build-antithesis.sh — antithesis-flavored build entry point.
+# build-antithesis.sh — antithesis-flavored build + Antithesis-registry push.
 #
-# Regenerates test/antithesis/config/docker-compose.yaml against the
-# current source tree before invoking ci.test.build, so that the
-# `antithesis-config` mzbuild image bakes in a compose YAML whose
-# materialized/antithesis-workload fingerprints match the fingerprints
-# this build is about to publish to GHCR.
-#
-# The committed YAML in test/antithesis/config/docker-compose.yaml is for
-# human review (PR diffs); its fingerprints can drift on every materialized
-# source change, and the staleness lint masks them by design. This script
-# is what guarantees Antithesis sees a self-consistent compose.
+# 1. Write `.env` so `antithesis-config` bakes in compose refs that point
+#    at the Antithesis GCP Artifact Registry (where we'll mirror to). The
+#    .env content is one of antithesis-config's mzbuild inputs, so the
+#    image fingerprint tracks the source it references — self-consistent.
+# 2. Run the standard `ci.test.build` to compile antithesis-flavored Rust
+#    binaries and build the docker images (pushed to GHCR via mzbuild).
+# 3. `docker login` the GCP Artifact Registry using
+#    `GCP_SERVICE_ACCOUNT_JSON` (already forwarded into ci-builder).
+# 4. Retag + push `materialized`, `antithesis-workload`, and
+#    `antithesis-config` to the Antithesis registry. Public images
+#    referenced by the compose (postgres, minio, kafka stack) stay on
+#    their upstream registries — Antithesis can reach those directly.
 
 set -euo pipefail
 
 : "${CI_ANTITHESIS:?build-antithesis.sh expects CI_ANTITHESIS=1}"
 
-echo "--- Regenerating test/antithesis/config/docker-compose.yaml"
-bin/pyactivate test/antithesis/export-compose.py \
-    > test/antithesis/config/docker-compose.yaml
+# GCP Artifact Registry path for Antithesis. Tags pushed under
+# $ANTITHESIS_REGISTRY/<name>:mzbuild-<fingerprint>.
+ANTITHESIS_REGISTRY="${ANTITHESIS_REGISTRY:-us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository}"
+
+echo "--- Writing test/antithesis/config/.env (registry: $ANTITHESIS_REGISTRY)"
+bin/pyactivate test/antithesis/export-env.py \
+    --registry "$ANTITHESIS_REGISTRY" \
+    > test/antithesis/config/.env
+
+echo "--- Building antithesis-flavored mzbuild images"
+bin/pyactivate -m ci.test.build
+
+echo "--- Authenticating to Antithesis registry"
+if [[ -z "${GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then
+    echo "GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2
+    echo "Provision it as a Buildkite-agent env var (see bin/ci-builder env-forwarding)." >&2
+    exit 1
+fi
+echo "$GCP_SERVICE_ACCOUNT_JSON" \
+    | docker login -u _json_key --password-stdin "https://${ANTITHESIS_REGISTRY%%/*}"
 
-exec bin/pyactivate -m ci.test.build
+echo "--- Pushing Materialize-built images to the Antithesis registry"
+bin/pyactivate test/antithesis/push-antithesis.py --registry "$ANTITHESIS_REGISTRY"
diff --git a/ci/test/lint-main/checks/check-antithesis-compose.sh b/ci/test/lint-main/checks/check-antithesis-compose.sh
index add2f2a0dab57..55c54f0bccfba 100755
--- a/ci/test/lint-main/checks/check-antithesis-compose.sh
+++ b/ci/test/lint-main/checks/check-antithesis-compose.sh
@@ -12,9 +12,10 @@
 # check-antithesis-compose.sh — ensure test/antithesis/config/docker-compose.yaml
 # is in sync with test/antithesis/mzcompose.py.
 #
-# Fingerprint refs (`mzbuild-<fp>`) shift on every materialized code change,
-# so we mask them before diffing — we only want to catch composition
-# (services, ports, env, deps) drift, not transient fingerprint churn.
+# Image refs in the committed YAML are `${MATERIALIZED_IMAGE}` style
+# placeholders (resolved from `.env` at compose-parse time), so the file is
+# stable across materialized source changes. A plain diff catches any
+# composition (services/ports/env/deps) drift.
 
 set -euo pipefail
 
@@ -29,11 +30,7 @@ check_antithesis_compose() {
 
     bin/pyactivate test/antithesis/export-compose.py > "$generated"
 
-    # Mask `mzbuild-<FINGERPRINT>` so the diff is structural-only.
-    local mask='s/(mzbuild-)[A-Z0-9]+/\1FINGERPRINT/g'
-    if ! diff -u \
-        <(sed -E "$mask" "$committed") \
-        <(sed -E "$mask" "$generated"); then
+    if ! diff -u "$committed" "$generated"; then
         echo
         echo "$committed is out of sync with test/antithesis/mzcompose.py."
         echo "Regenerate with:"
diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile
index f25077b582d33..878bf7e384019 100644
--- a/test/antithesis/Makefile
+++ b/test/antithesis/Makefile
@@ -41,24 +41,29 @@ ifeq ($(RUNTIME),podman)
 endif
 
 COMPOSE_FILE := $(REPO_ROOT)/test/antithesis/config/docker-compose.yaml
-COMPOSE      := $(RUNTIME) compose -p $(PROJECT) -f $(COMPOSE_FILE)
+ENV_FILE     := $(REPO_ROOT)/test/antithesis/config/.env
+COMPOSE      := $(RUNTIME) compose -p $(PROJECT) --env-file $(ENV_FILE) -f $(COMPOSE_FILE)
 PSQL         := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize
 
 # mzbuild images we need built locally. Third-party images (postgres, minio,
-# redpanda) are pulled by `docker compose` from their upstream registries.
+# kafka, …) are pulled by `docker compose` from their upstream registries.
 MZBUILD_IMAGES := materialized antithesis-workload
 
 # ---------------------------------------------------------------------------
 # Build
 # ---------------------------------------------------------------------------
-.PHONY: build export-compose acquire-images
+.PHONY: build export-compose export-env acquire-images
 
-build: export-compose acquire-images
+build: export-compose export-env acquire-images
 
 export-compose:
 	cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py > $(COMPOSE_FILE)
 	@echo "Wrote $(COMPOSE_FILE)"
 
+export-env:
+	cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-env.py > $(ENV_FILE)
+	@echo "Wrote $(ENV_FILE)"
+
 acquire-images:
 	@for image in $(MZBUILD_IMAGES); do \
 	  echo "--- Acquiring $$image (--antithesis)"; \
diff --git a/test/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile
index 386049db7e8e5..32fcb07e30460 100644
--- a/test/antithesis/config/Dockerfile
+++ b/test/antithesis/config/Dockerfile
@@ -7,9 +7,12 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 
-# Antithesis config image: a FROM-scratch tarball holding the resolved
+# Antithesis config image: a FROM-scratch tarball holding the
 # docker-compose.yaml that Antithesis uses to bring up the system under
-# test. See mzbuild.yml for regeneration instructions.
+# test, plus a `.env` mapping `${MATERIALIZED_IMAGE}` /
+# `${ANTITHESIS_WORKLOAD_IMAGE}` to current mzbuild fingerprints. Compose
+# loads `.env` automatically at parse time. See mzbuild.yml for
+# regeneration instructions.
 
 FROM scratch
-COPY docker-compose.yaml /
+COPY docker-compose.yaml .env /
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 26819190cd164..73291200a043c 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -209,7 +209,7 @@ services:
     restart: 'no'
     stop_grace_period: 120s
     platform: linux/amd64
-    image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7
+    image: ${MATERIALIZED_IMAGE}
   materialized:
     hostname: materialized
     depends_on:
@@ -397,7 +397,7 @@ services:
       start_period: 600s
     stop_grace_period: 120s
     platform: linux/amd64
-    image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7
+    image: ${MATERIALIZED_IMAGE}
   workload:
     depends_on:
       materialized:
@@ -418,7 +418,7 @@ services:
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
     - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
     platform: linux/amd64
-    image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-XX2UEHO746TTSXP3JUOIMJTYD2WWEBLY
+    image: ${ANTITHESIS_WORKLOAD_IMAGE}
 networks: {}
 volumes:
   mzdata: null
diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml
index 899d620d1285f..07011b460f407 100644
--- a/test/antithesis/config/mzbuild.yml
+++ b/test/antithesis/config/mzbuild.yml
@@ -7,13 +7,20 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 
-# FROM-scratch image holding the resolved docker-compose.yaml for the
-# Antithesis environment. Antithesis pulls this image and reads the compose
-# spec from `/docker-compose.yaml` to bring up the system under test.
+# FROM-scratch image holding the docker-compose.yaml + .env for the
+# Antithesis environment. Antithesis pulls this image and reads
+# `/docker-compose.yaml` to bring up the system under test; `.env` supplies
+# `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}` at compose-parse
+# time.
 #
-# The compose file is generated from test/antithesis/mzcompose.py via
-# `bin/pyactivate test/antithesis/export-compose.py`. Re-run that whenever
-# the composition topology changes; CI verifies the committed copy is up to
-# date.
+# The compose YAML (committed, topology-only) is generated from
+# `test/antithesis/mzcompose.py` via `bin/pyactivate
+# test/antithesis/export-compose.py`. Regenerate when topology changes; CI
+# verifies the committed copy is up to date.
+#
+# `.env` (generated, gitignored) is written by
+# `bin/pyactivate test/antithesis/export-env.py` at build time. Its content
+# changes every materialized fingerprint shift, which is what propagates
+# fresh fingerprints into this image without touching the committed YAML.
 
 name: antithesis-config
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 081ce78ed41db..dcab7c16a2866 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -11,20 +11,24 @@
 
 """Export the resolved docker-compose YAML for the Antithesis composition.
 
-Loads `test/antithesis/mzcompose.py`, resolves every `mzbuild:` reference,
-and dumps the resulting docker-compose dict to stdout. Antithesis pulls the
-referenced images directly from public GHCR — no separate registry, no
-re-tagging.
+Loads `test/antithesis/mzcompose.py` and dumps a docker-compose YAML to
+stdout where Materialize-built images are emitted as compose env-var
+placeholders (`${MATERIALIZED_IMAGE}`, `${ANTITHESIS_WORKLOAD_IMAGE}`).
+The actual fingerprint values are supplied separately in a `.env` file
+generated by `export-env.py`. This separation lets the committed YAML stay
+stable across materialized source changes — only `.env` shifts per
+fingerprint.
 
 Image-reference policy:
 
-  * Materialize-built images (`materialized`, `antithesis-workload`) are
-    emitted as `ghcr.io/materializeinc/materialize/<name>:mzbuild-<fp>`.
-    The fingerprint participates in `antithesis=True` so antithesis builds
-    don't collide with regular builds.
+  * Materialize-built images (`materialized`, `antithesis-workload`)
+    become `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}`.
+    Compose interpolates them from `.env` at parse time. The actual specs
+    are `ghcr.io/materializeinc/materialize/<name>:mzbuild-<fp>` with
+    `antithesis=True` participating in the fingerprint.
 
-  * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with the
-    public upstream image. Our mzbuild variants bake in test-friendly
+  * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with
+    the public upstream image. Our mzbuild variants bake in test-friendly
     patches (eatmydata, no_fsync) that defeat Antithesis's fault injection;
     Antithesis runs against vanilla.
 
@@ -48,12 +52,18 @@
 from materialize.mzbuild import Repository
 from materialize.mzcompose.composition import Composition
 
-# mzbuild image names that we publish to GHCR and want Antithesis to pull
-# under our fingerprint. Everything else falls back to a public image.
-MATERIALIZE_IMAGES = {"materialized", "antithesis-workload"}
+# mzbuild image names that we publish under our fingerprint. Each maps to
+# the compose env-var placeholder; `.env` (export-env.py) supplies the
+# concrete ref at compose-parse time. Keep in sync with `export-env.py`.
+MATERIALIZE_IMAGES = {
+    "materialized": "${MATERIALIZED_IMAGE}",
+    "antithesis-workload": "${ANTITHESIS_WORKLOAD_IMAGE}",
+}
 
 # Public-image fallbacks for mzbuild images whose Materialize-specific
 # customizations subvert Antithesis (eatmydata, fsync no-ops, etc.).
+# Antithesis can reach public registries — we just need to make sure the
+# compose points at the upstream image, not our patched mzbuild build.
 PUBLIC_FALLBACKS = {
     "postgres": "postgres:17.7",
     "minio": "minio/minio:latest",
@@ -78,18 +88,18 @@
 """
 
 
-def resolve_mzbuild(svc: dict[str, Any], deps: Any) -> None:
-    """Replace `mzbuild:` with a concrete `image:` ref."""
+def resolve_mzbuild(svc: dict[str, Any]) -> None:
+    """Replace `mzbuild:` with a concrete or templated `image:` ref."""
     name = svc.pop("mzbuild")
     if name in MATERIALIZE_IMAGES:
-        svc["image"] = deps[name].spec()
+        svc["image"] = MATERIALIZE_IMAGES[name]
     elif name in PUBLIC_FALLBACKS:
         svc["image"] = PUBLIC_FALLBACKS[name]
     else:
         raise ValueError(
             f"mzbuild image {name!r} has no Antithesis policy — add it to "
-            f"MATERIALIZE_IMAGES (use our GHCR build) or PUBLIC_FALLBACKS "
-            f"(swap to a public image) in export-compose.py."
+            f"MATERIALIZE_IMAGES (use a `.env` placeholder) or "
+            f"PUBLIC_FALLBACKS (swap to a public image) in export-compose.py."
         )
 
 
@@ -181,21 +191,16 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None:
 def main() -> None:
     # munge_services=False keeps ports bare (e.g., `6875` instead of
     # `127.0.0.1::6875`) — Antithesis is container-to-container, no host
-    # binding. We do our own mzbuild→image substitution below.
+    # binding. We do our own mzbuild→image substitution below and don't
+    # need fingerprint resolution since Materialize-built images become
+    # `${...}` placeholders.
     repo = Repository(Path("."), arch="x86_64", antithesis=True)
     c = Composition(repo, "antithesis", munge_services=False)
 
-    images = [
-        repo.images[svc["mzbuild"]]
-        for svc in c.compose["services"].values()
-        if "mzbuild" in svc
-    ]
-    deps = repo.resolve_dependencies(images)
-
     for svc in c.compose["services"].values():
         svc["platform"] = "linux/amd64"
         if "mzbuild" in svc:
-            resolve_mzbuild(svc, deps)
+            resolve_mzbuild(svc)
         inline_postgres_setup(svc)
         strip_host_bindmounts(svc)
         strip_incompatible_env(svc)
diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py
new file mode 100644
index 0000000000000..c1ea463d80e2e
--- /dev/null
+++ b/test/antithesis/export-env.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Emit the `.env` file consumed by Antithesis's docker-compose.yaml.
+
+The compose YAML (export-compose.py) is committed with `${MATERIALIZED_IMAGE}`
+/ `${ANTITHESIS_WORKLOAD_IMAGE}` placeholders so it stays stable across
+materialized source changes. This script writes the corresponding `.env`
+with the current mzbuild fingerprints so compose can interpolate them.
+
+Run at CI build time (build-antithesis.sh) and at local-dev `make build`.
+The `antithesis-config` mzbuild image copies in the .env produced by this
+script, so the image's own fingerprint tracks the materialized fingerprint
+transitively — same materialized → same .env → same antithesis-config.
+
+With `--registry`, the emitted refs use that registry prefix instead of
+the default (whatever `spec()` returns based on `MZ_GHCR`). CI passes the
+Antithesis GCP Artifact Registry so that the compose file Antithesis
+pulls references images in a registry Antithesis can actually reach.
+
+Usage:
+    bin/pyactivate test/antithesis/export-env.py \\
+        > test/antithesis/config/.env
+    bin/pyactivate test/antithesis/export-env.py \\
+        --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository \\
+        > test/antithesis/config/.env
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+from materialize.mzbuild import Repository
+
+# Mapping of `.env` variable name → mzbuild image name. Keep in sync with
+# MATERIALIZE_IMAGES in export-compose.py.
+ENV_VARS = {
+    "MATERIALIZED_IMAGE": "materialized",
+    "ANTITHESIS_WORKLOAD_IMAGE": "antithesis-workload",
+}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument(
+        "--registry",
+        default=None,
+        help=(
+            "Registry prefix to use for emitted refs. If unset, uses the "
+            "default `spec()` (GHCR when MZ_GHCR=1, else Docker Hub)."
+        ),
+    )
+    args = parser.parse_args()
+
+    repo = Repository(Path("."), arch="x86_64", antithesis=True)
+    images = [repo.images[name] for name in ENV_VARS.values()]
+    deps = repo.resolve_dependencies(images)
+
+    sys.stdout.write(
+        "# GENERATED FILE — do not edit. Regenerate via:\n"
+        "#   bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env\n"
+        "# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.\n"
+    )
+    for var, image_name in ENV_VARS.items():
+        if args.registry:
+            ref = f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}"
+        else:
+            ref = deps[image_name].spec()
+        sys.stdout.write(f"{var}={ref}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py
new file mode 100755
index 0000000000000..83346214ac841
--- /dev/null
+++ b/test/antithesis/push-antithesis.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Retag + push antithesis-flavored images to Antithesis's GCP registry.
+
+Antithesis's sandbox pulls images by reference. Our standard mzbuild flow
+publishes to GHCR with `mzbuild-<fp>` tags, but new GHCR packages default
+to private visibility — Antithesis hits a 4001 (image-not-reachable) when
+trying to pull them. Pushing to a GCP Artifact Registry whose IAM grants
+Antithesis read access avoids the visibility dance entirely.
+
+This script presumes `ci.test.build` has already run (so the source images
+exist locally) and that `docker login` against the target registry has
+already happened (build-antithesis.sh handles that via
+GCP_SERVICE_ACCOUNT_JSON).
+
+Usage:
+    bin/pyactivate test/antithesis/push-antithesis.py \\
+        --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository
+"""
+
+import argparse
+from pathlib import Path
+
+from materialize import spawn, ui
+from materialize.mzbuild import Repository
+
+# Images Antithesis needs to be able to pull:
+#   - antithesis-config holds the docker-compose.yaml + .env Antithesis runs.
+#   - materialized + antithesis-workload are referenced by that compose.
+ANTITHESIS_IMAGES = ["materialized", "antithesis-workload", "antithesis-config"]
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument(
+        "--registry",
+        required=True,
+        help="Antithesis registry prefix, e.g. us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository",
+    )
+    args = parser.parse_args()
+
+    # Match the Repository configuration used by ci.test.build so that
+    # `deps[name].spec()` returns the same local tag that build actually
+    # produced (materialize/<name>:mzbuild-<fp>, not the GHCR-prefixed one).
+    repo = Repository(
+        Path("."),
+        arch="x86_64",
+        antithesis=True,
+        image_registry="materialize",
+    )
+    deps = repo.resolve_dependencies([repo.images[name] for name in ANTITHESIS_IMAGES])
+
+    # Ensure each image is actually present locally before retag — ci.test.build's
+    # `ensure()` path may short-circuit to "already pushed" without leaving a
+    # local copy if the fingerprint was already in the cache.
+    deps.acquire()
+
+    for name in ANTITHESIS_IMAGES:
+        resolved = deps[name]
+        source = resolved.spec()
+        target = f"{args.registry}/{name}:mzbuild-{resolved.fingerprint()}"
+        ui.section(f"Pushing {name}")
+        print(f"    source: {source}")
+        print(f"    target: {target}")
+        spawn.runv(["docker", "tag", source, target])
+        spawn.runv(["docker", "push", target])
+
+
+if __name__ == "__main__":
+    main()

From d4373eb8bdb3d9bd615bf36bca6ed4a9c553031f Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 15:20:37 -0400
Subject: [PATCH 15/65] ci: distinct ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON for
 Antithesis registry push

---
 bin/ci-builder              |  1 +
 ci/test/build-antithesis.sh | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/bin/ci-builder b/bin/ci-builder
index 0e81c806063d8..6d53be5cad2f5 100755
--- a/bin/ci-builder
+++ b/bin/ci-builder
@@ -281,6 +281,7 @@ case "$cmd" in
                 --env AZURE_SERVICE_ACCOUNT_PASSWORD
                 --env AZURE_SERVICE_ACCOUNT_TENANT
                 --env GCP_SERVICE_ACCOUNT_JSON
+                --env ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON
                 --env GITHUB_TOKEN
                 --env GITHUB_GHCR_TOKEN
                 --env GPG_KEY
diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh
index ef9d24d7c420c..23d9480ad8188 100755
--- a/ci/test/build-antithesis.sh
+++ b/ci/test/build-antithesis.sh
@@ -17,8 +17,11 @@
 #    image fingerprint tracks the source it references — self-consistent.
 # 2. Run the standard `ci.test.build` to compile antithesis-flavored Rust
 #    binaries and build the docker images (pushed to GHCR via mzbuild).
-# 3. `docker login` the GCP Artifact Registry using
-#    `GCP_SERVICE_ACCOUNT_JSON` (already forwarded into ci-builder).
+# 3. `docker login` the Antithesis GCP Artifact Registry using
+#    `ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON` (a service account scoped to
+#    `materialize-storage@molten-verve-216720.iam.gserviceaccount.com` —
+#    kept distinct from `GCP_SERVICE_ACCOUNT_JSON` which is used elsewhere
+#    for unrelated GCP integrations).
 # 4. Retag + push `materialized`, `antithesis-workload`, and
 #    `antithesis-config` to the Antithesis registry. Public images
 #    referenced by the compose (postgres, minio, kafka stack) stay on
@@ -41,12 +44,12 @@ echo "--- Building antithesis-flavored mzbuild images"
 bin/pyactivate -m ci.test.build
 
 echo "--- Authenticating to Antithesis registry"
-if [[ -z "${GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then
-    echo "GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2
+if [[ -z "${ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then
+    echo "ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2
     echo "Provision it as a Buildkite-agent env var (see bin/ci-builder env-forwarding)." >&2
     exit 1
 fi
-echo "$GCP_SERVICE_ACCOUNT_JSON" \
+echo "$ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON" \
     | docker login -u _json_key --password-stdin "https://${ANTITHESIS_REGISTRY%%/*}"
 
 echo "--- Pushing Materialize-built images to the Antithesis registry"

From 3278bda7f8757b2c326f93bd77f318683f31acaa Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 16:06:21 -0400
Subject: [PATCH 16/65] test/antithesis: mark antithesis-config publish:false +
 commit placeholder .env
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mzbuild's _build_locked runs `git clean -ffdX <image_path>` before each
build, which wipes any gitignored file in the build context — including
the .env we generate. Two fixes:

1. publish:false on antithesis-config so the standard ci.test.build flow
   skips it entirely on regular nightly builds (where .env is never generated).
   Only build-antithesis.sh / push-antithesis.py builds this image, and
   they write .env first.

2. Commit a placeholder .env so the file is tracked (survives git clean)
   and participates in mzbuild's fingerprint computation. build-antithesis.sh
   overwrites it with real registry refs before the build runs;
   fingerprint reflects the overwritten content per build.
---
 .gitignore                         |  2 --
 test/antithesis/config/.env        | 21 +++++++++++++++++++++
 test/antithesis/config/mzbuild.yml |  7 +++++++
 3 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 test/antithesis/config/.env

diff --git a/.gitignore b/.gitignore
index 58321fab14d4f..6eb7e16708d6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,8 +18,6 @@
 mzdata
 mzbuild
 __pycache__
-# Antithesis compose env file — generated by build-antithesis.sh / make build.
-/test/antithesis/config/.env
 .mypy_cache
 venv
 node_modules
diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env
new file mode 100644
index 0000000000000..d4f160a98596f
--- /dev/null
+++ b/test/antithesis/config/.env
@@ -0,0 +1,21 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Compose env-file for `test/antithesis/config/docker-compose.yaml`.
+# Tracked by git only so that the file exists for mzbuild's input
+# fingerprinting and survives `git clean -ffdX` between builds. The
+# committed values are placeholders — `build-antithesis.sh` overwrites
+# them in CI with refs to images pushed to Antithesis's GCP Artifact
+# Registry, and `make export-env` does the same with local-dev refs.
+#
+# If you see these placeholder values on a running cluster, your build
+# pipeline did not regenerate this file. Run:
+#   bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env
+MATERIALIZED_IMAGE=placeholder-not-built
+ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built
diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml
index 07011b460f407..f3491f546dbb5 100644
--- a/test/antithesis/config/mzbuild.yml
+++ b/test/antithesis/config/mzbuild.yml
@@ -22,5 +22,12 @@
 # `bin/pyactivate test/antithesis/export-env.py` at build time. Its content
 # changes every materialized fingerprint shift, which is what propagates
 # fresh fingerprints into this image without touching the committed YAML.
+#
+# `publish: false` keeps the standard `ci.test.build` flow from building
+# this image — outside the Antithesis pipeline the committed `.env` holds
+# only placeholder refs; `build-antithesis.sh` overwrites it with real ones.
+# The antithesis nightly step builds and pushes the image directly via
+# push-antithesis.py.
 
 name: antithesis-config
+publish: false

From 007c7af9d9970fb2030c7212368b232e0fbc363e Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 16:12:25 -0400
Subject: [PATCH 17/65] test/antithesis: pass Arch enum to Repository, not
 string

---
 test/antithesis/export-compose.py  | 3 ++-
 test/antithesis/export-env.py      | 3 ++-
 test/antithesis/push-antithesis.py | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index dcab7c16a2866..4e1fb5bece519 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -51,6 +51,7 @@
 from materialize import MZ_ROOT
 from materialize.mzbuild import Repository
 from materialize.mzcompose.composition import Composition
+from materialize.xcompile import Arch
 
 # mzbuild image names that we publish under our fingerprint. Each maps to
 # the compose env-var placeholder; `.env` (export-env.py) supplies the
@@ -194,7 +195,7 @@ def main() -> None:
     # binding. We do our own mzbuild→image substitution below and don't
     # need fingerprint resolution since Materialize-built images become
     # `${...}` placeholders.
-    repo = Repository(Path("."), arch="x86_64", antithesis=True)
+    repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True)
     c = Composition(repo, "antithesis", munge_services=False)
 
     for svc in c.compose["services"].values():
diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py
index c1ea463d80e2e..043c912cf1608 100644
--- a/test/antithesis/export-env.py
+++ b/test/antithesis/export-env.py
@@ -39,6 +39,7 @@
 from pathlib import Path
 
 from materialize.mzbuild import Repository
+from materialize.xcompile import Arch
 
 # Mapping of `.env` variable name → mzbuild image name. Keep in sync with
 # MATERIALIZE_IMAGES in export-compose.py.
@@ -60,7 +61,7 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    repo = Repository(Path("."), arch="x86_64", antithesis=True)
+    repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True)
     images = [repo.images[name] for name in ENV_VARS.values()]
     deps = repo.resolve_dependencies(images)
 
diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py
index 83346214ac841..2787f5cee8f30 100755
--- a/test/antithesis/push-antithesis.py
+++ b/test/antithesis/push-antithesis.py
@@ -32,6 +32,7 @@
 
 from materialize import spawn, ui
 from materialize.mzbuild import Repository
+from materialize.xcompile import Arch
 
 # Images Antithesis needs to be able to pull:
 #   - antithesis-config holds the docker-compose.yaml + .env Antithesis runs.
@@ -53,7 +54,7 @@ def main() -> None:
     # produced (materialize/<name>:mzbuild-<fp>, not the GHCR-prefixed one).
     repo = Repository(
         Path("."),
-        arch="x86_64",
+        arch=Arch.X86_64,
         antithesis=True,
         image_registry="materialize",
     )

From 8e459cdf56d46466fdcf0ba435ff40324e047c1a Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 16:38:44 -0400
Subject: [PATCH 18/65] test/antithesis: kafka source property catalog + first
 workload property

Add 16 Antithesis properties for Kafka source ingestion (NONE + UPSERT
envelopes) to the scratchbook, plus the workload-side implementation of
upsert-key-reflects-latest-value.

Scratchbook additions:
  - sut-analysis Appendix A: kafka source pipeline detail
  - existing-assertions: enumerated SUT-side panic/assert sites that are
    candidates for Antithesis SDK instrumentation
  - property-catalog Category 7: 16 new Kafka/UPSERT properties
  - property-relationships clusters 7-10 plus cross-cluster connections
  - 16 per-property evidence files
  - evaluation/synthesis.md: four-lens review

Workload:
  - parallel_driver_upsert_latest_value.py: produces upserts+tombstones
    with deterministic randomness, requests a quiet period, polls
    mz_source_statistics for catchup, and asserts per-key value match
    (two always() assertions + one sometimes() liveness anchor).
  - helper_pg / helper_kafka / helper_quiet / helper_random /
    helper_source_stats / helper_upsert_source: shared utilities for
    subsequent Kafka source properties.
---
 test/antithesis/export-env.py                 |   4 +-
 .../scratchbook/evaluation/synthesis.md       |  81 +++++++
 .../scratchbook/existing-assertions.md        |  51 +++++
 .../kafka-source-frontier-monotonic.md        |  40 ++++
 .../kafka-source-no-data-duplication.md       |  44 ++++
 .../properties/kafka-source-no-data-loss.md   |  42 ++++
 .../kafka-source-no-internal-panic.md         |  44 ++++
 .../kafka-source-survives-broker-fault.md     |  40 ++++
 .../kafka-source-survives-clusterd-restart.md |  47 ++++
 .../offset-known-not-below-committed.md       |  39 ++++
 .../reclock-mint-eventually-succeeds.md       |  61 +++++
 .../remap-shard-antichain-wellformed.md       |  55 +++++
 .../upsert-decode-error-retractable.md        |  42 ++++
 ...ert-ensure-decoded-called-before-access.md |  43 ++++
 .../upsert-key-reflects-latest-value.md       |  63 ++++++
 .../properties/upsert-no-internal-panic.md    |  43 ++++
 .../upsert-state-consolidation-wellformed.md  |  75 +++++++
 .../upsert-state-rehydrates-correctly.md      |  46 ++++
 .../upsert-tombstone-removes-key.md           |  38 ++++
 .../scratchbook/property-catalog.md           | 197 +++++++++++++++-
 .../scratchbook/property-relationships.md     |  39 ++++
 test/antithesis/scratchbook/sut-analysis.md   |  81 +++++++
 test/antithesis/workload/test/helper_kafka.py |  90 ++++++++
 test/antithesis/workload/test/helper_pg.py    | 120 ++++++++++
 test/antithesis/workload/test/helper_quiet.py |  38 ++++
 .../antithesis/workload/test/helper_random.py |  64 ++++++
 .../workload/test/helper_source_stats.py      |  86 +++++++
 .../workload/test/helper_upsert_source.py     |  54 +++++
 .../parallel_driver_upsert_latest_value.py    | 211 ++++++++++++++++++
 29 files changed, 1869 insertions(+), 9 deletions(-)
 create mode 100644 test/antithesis/scratchbook/evaluation/synthesis.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md
 create mode 100644 test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md
 create mode 100644 test/antithesis/scratchbook/properties/offset-known-not-below-committed.md
 create mode 100644 test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md
 create mode 100644 test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-no-internal-panic.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
 create mode 100644 test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
 create mode 100644 test/antithesis/workload/test/helper_kafka.py
 create mode 100644 test/antithesis/workload/test/helper_pg.py
 create mode 100644 test/antithesis/workload/test/helper_quiet.py
 create mode 100644 test/antithesis/workload/test/helper_random.py
 create mode 100644 test/antithesis/workload/test/helper_source_stats.py
 create mode 100644 test/antithesis/workload/test/helper_upsert_source.py
 create mode 100755 test/antithesis/workload/test/parallel_driver_upsert_latest_value.py

diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py
index 043c912cf1608..5488a0f097673 100644
--- a/test/antithesis/export-env.py
+++ b/test/antithesis/export-env.py
@@ -72,7 +72,9 @@ def main() -> None:
     )
     for var, image_name in ENV_VARS.items():
         if args.registry:
-            ref = f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}"
+            ref = (
+                f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}"
+            )
         else:
             ref = deps[image_name].spec()
         sys.stdout.write(f"{var}={ref}\n")
diff --git a/test/antithesis/scratchbook/evaluation/synthesis.md b/test/antithesis/scratchbook/evaluation/synthesis.md
new file mode 100644
index 0000000000000..fff919f61edf2
--- /dev/null
+++ b/test/antithesis/scratchbook/evaluation/synthesis.md
@@ -0,0 +1,81 @@
+# Property Catalog Evaluation — Kafka Source Additions
+
+**Scope**: The 16 properties added to Category 7 in `property-catalog.md` on 2026-05-11 targeting the Kafka source ingestion pipeline (NONE + UPSERT envelopes), and the assertion sites in `existing-assertions.md`. Pre-existing properties in Categories 1-6 are *not* re-evaluated here — they passed evaluation on 2026-05-06 and nothing has changed in their code paths. The 16 are: 5 user-visible Kafka source properties (`kafka-source-no-data-loss`, `-no-data-duplication`, `-frontier-monotonic`, `-survives-broker-fault`, `-survives-clusterd-restart`), 4 UPSERT envelope properties (`upsert-key-reflects-latest-value`, `-tombstone-removes-key`, `-state-rehydrates-correctly`, `-decode-error-retractable`), 3 UPSERT operator-internal properties (`upsert-no-internal-panic`, `-state-consolidation-wellformed`, `-ensure-decoded-called-before-access`), and 4 reclock / source-reader operator-internal properties (`kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed`).
+
+This evaluation was performed in single-agent mode across all four lenses and written up as a single synthesis. Per-lens evidence is inline below rather than in separate files; spawning four parallel ensemble agents for a targeted 16-property addition would have been over-engineering, since an addition this small is better served by a single reviewer's pass over the catalog.
+
+## Lens 1 — Antithesis Fit
+
+**Passes**:
+
+- All 16 properties target timing-sensitive, concurrency-sensitive, or partial-failure scenarios. None can be fully verified by a deterministic unit test.
+- Mix of assertion types is healthy: 7 Safety (`Always`), 3 Liveness (`Sometimes`), 3 Reachability (`Unreachable`), 2 properties combine multiple assertion families internally.
+- Several properties (`kafka-source-survives-clusterd-restart`, `upsert-state-rehydrates-correctly`, `reclock-mint-eventually-succeeds`) explicitly need fault injection that deterministic tests can't sequence — strong Antithesis fit.
+- The SUT-side instrumentation properties (`upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access`, `kafka-source-no-internal-panic`) wrap *existing* asserts/panics rather than adding new logic; this is the cheapest possible instrumentation cost.
+
+**Refinements**:
+
+- `offset-known-not-below-committed` is borderline unit-test material — the invariant could be tested by mocking the statistics update path. Kept in the catalog because the *interesting* failure is the restart-window timing, which is genuinely Antithesis territory; it warrants P2 rather than P1, which is the priority the catalog already assigns, so no edit is needed.
+- `upsert-decode-error-retractable` could be tested as integration. It earns its catalog slot only if the test exercises crash recovery between the bad and good message; the evidence file already calls this out. No change needed.
+
+**Findings**: None. Antithesis fit is good across the addition.
+
+## Lens 2 — Coverage Balance
+
+**Passes**:
+
+- Both envelopes (NONE and UPSERT) get dedicated coverage.
+- The SUT analysis's Appendix A failure-prone areas table has 9 rows; 8 of them are covered by at least one new property. The one uncovered row is "Flag flip mid-append on persist sink (commit 68e1dfd86d)" — see G1 below.
+- Liveness, Safety, and Reachability are all represented.
+- Both workload-observable and SUT-side properties exist; the workload-only properties form the user-visible contract (`kafka-source-no-data-loss`, etc.) and the SUT-side properties form the operator-internal correctness backbone.
+
+**Gaps identified** (each given a disposition during this pass — see "Addressing Findings" below):
+
+- **G1: Persist sink flag-flip TOCTOU** — commit `68e1dfd86d` (database-issues#9585) regression is not represented. The bug was a config flag re-evaluated multiple times during `append_batches`. Decision: **Acknowledged but not added**. This is a persist-sink generic correctness property, not Kafka-source-specific; it belongs in Category 1 (Persist Layer Safety), not in the Kafka section. Filing as a follow-up note in `property-relationships.md` would clutter the relationships; instead, called out here as a known omission for a future persist-focused research pass.
+
+- **G2: Partition reassignment correctness** — Kafka topic adding/removing partitions while the source is live is mentioned in the SUT analysis but not captured as a property. The closest is `kafka-source-no-internal-panic` which catches *panics* on the rebalance path but not *correctness* (no data loss, no duplicates, correct partition→worker assignment under rebalance). Decision: **Catalog as a future expansion item**, not added in this pass because it requires non-trivial workload support (the test driver must be able to dynamically add Kafka partitions, and the worker-hash assignment property requires multi-worker clusterd).
+
+- **G3: Schema Registry interaction** — Avro / Protobuf decoding via Schema Registry is a significant Kafka source code path that is unmentioned. Schema evolution mid-source is a known operational hazard. Decision: **Future expansion item**. The workload is realistically text/JSON for v1 of these properties; Schema Registry coverage is a v2 expansion.
+
+**Refinements**:
+
+- The pre-existing `source-ingestion-progress` property is now redundant with `kafka-source-no-data-loss` for Kafka specifically. The relationships file calls this out. Decision: **Keep both** — `source-ingestion-progress` remains valid for non-Kafka sources (Postgres CDC, MySQL, generators), so it doesn't go away. The new property is more specific. No catalog edit needed beyond the cross-reference in `property-relationships.md`.
+
+## Lens 3 — Implementability
+
+**Passes**:
+
+- All workload-level properties can be checked via standard SQL queries against `mz_internal.mz_source_statistics_per_worker` and direct `SELECT` from the source. The workload only needs a PostgreSQL client and a Kafka producer (both already required by the existing topology in `deployment-topology.md`).
+- All SUT-side properties wrap *existing* code (panic / assert / unreachable sites). No new SUT instrumentation logic is required, only replacing the existing macro with the Antithesis SDK equivalent and giving each callsite a unique message.
+- Deployment topology already provides Kafka (Redpanda) and `materialized` in separate containers; network partition between them is a supported fault.
+- Multi-replica scenarios for `upsert-state-consolidation-wellformed` and the upsert internals require a topology variation (multiple compute replicas serving the same source). The existing topology is single-replica; this is flagged.
+
+**Refinements**:
+
+- `kafka-source-survives-clusterd-restart` requires **node-termination faults**, which the `faults.md` reference says are disabled by default in Antithesis tenants. Flagged in the evidence file. The user should confirm this fault class is enabled.
+- `upsert-state-consolidation-wellformed` (and `kafka-source-no-data-duplication` for the historical multi-replica regression) gain significant value from a multi-replica topology. Suggest adding a second topology variant to `deployment-topology.md` as a follow-up — single-replica is sufficient to start, but the multi-replica drain bug (commit `1accbe28b3`) requires multi-replica to reproduce.
+
+**Findings (refinements applied or noted in evidence files)**:
+
+- R1: Added a note to `properties/kafka-source-survives-clusterd-restart.md` calling out the node-termination-faults dependency.
+- R2: Added a note to `properties/upsert-state-consolidation-wellformed.md` explaining the multi-replica relevance.
+
+## Lens 4 — Wildcard
+
+**Things the other lenses missed**:
+
+- **W1: Multi-topic / multi-source interaction.** The 16 properties all treat a single Kafka source as the unit of analysis. The real-world failure mode of "two Kafka sources on the same cluster, one is healthy, the other is partitioned" is unaddressed. The `materialized` container hosts both; partitioning one source from its broker should not affect the other. Decision: **Future expansion**. Adding this now would expand the workload significantly.
+
+- **W2: Clock-jump interaction with Kafka timestamps.** The SUT analysis flags `expect("kafka sources always have upstream_time")` at kafka.rs:1209 — this depends on the Kafka message timestamp being valid. Clock jumps on the *Kafka broker* could produce future or past message timestamps. The current property set doesn't address how Materialize handles a backward-clocked Kafka broker. Decision: **Acknowledged as a known gap**, similar to W1.
+
+- **W3: Reading the catalog as a whole, the SUT-side instrumentation properties feel like a single "wrap all the existing panics in Antithesis SDK" project rather than four separate properties.** Decision: **Keep the four-property structure** anyway, because the slugs give Antithesis distinct property tags and the per-site message uniqueness requirement makes them genuinely distinct invariants. But operationally, a single PR can implement all four.
+
+## Addressing Findings
+
+- **Refinements applied**: R1, R2 (noted in evidence files during this pass).
+- **Gaps held as known omissions**: G1 (persist-sink flag flip — belongs in Category 1), G2 (partition reassignment — needs workload extension), G3 (schema registry — v2 expansion), W1 (multi-source interaction), W2 (clock jumps on broker).
+- **Biases escalated to user**: None — the catalog framing matches the user's stated scope ("basic properties for Kafka sources, both normal and upsert workloads"). The "basic" qualifier explicitly suggests that some areas like partition reassignment, schema registry, and multi-source scenarios are intentionally deferred to future passes.
+
+## Conclusion
+
+The 16-property Kafka source addition is implementable, well-scoped to Antithesis's strengths, and covers both envelopes plus the shared reclock layer. Known gaps are documented above as follow-up candidates. No biases escalated; the user's "basic" framing aligns with the catalog scope.
diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md
index 8e423c26a0415..592d71d368c15 100644
--- a/test/antithesis/scratchbook/existing-assertions.md
+++ b/test/antithesis/scratchbook/existing-assertions.md
@@ -35,3 +35,54 @@ Full Kubernetes topology: environmentd StatefulSet, postgres StatefulSet, redpan
 ## Implications for New Work
 
 All property assertions will need to be added fresh. The existing integration provides a starting point for topology but uses an older approach (experiment scripts, custom instrumented images). The new approach should leverage mzcompose for compose generation and add Antithesis SDK assertions either in the workload client or (for deeper coverage) in the Materialize Rust source.
+
+## Storage/Kafka/UPSERT Path — Candidate Instrumentation Sites
+
+Added 2026-05-11 during Kafka-source property discovery. These are existing `panic!`/`assert!`/`unreachable!` sites in the storage code that are direct candidates for being wrapped with the Antithesis SDK so that violations surface as reportable property failures rather than process aborts. Confirmed by grepping the source at commit `007c7af9d9970fb2030c7212368b232e0fbc363e`.
+
+### `src/storage/src/source/kafka.rs`
+
+- `:158` — `expect("positive pid")`
+- `:265` — `expect("all source exports must be present in source resume uppers")`
+- `:276` — `panic!("unexpected source export details: {:?}", details)`
+- `:282` — `expect("statistics have been initialized")`
+- `:345` — `expect("restored kafka offsets must fit into i64")`
+- `:606, :853, :855, :891, :894, :897, :903, :907, :997` — various `expect()` and `assert!()` on reader state
+- `:1142-1147` — `assert!(self.last_offsets[output_index].contains_key(&partition))`
+- `:1193-1197` — `panic!("got negative offset (...) from otherwise non-error'd kafka message")`
+- `:1209` — `expect("kafka sources always have upstream_time")`
+- `:1457` — `assert!(…)` on payload structure
+
+### `src/storage/src/source/reclock.rs` and `reclock/compat.rs`
+
+- `reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))`
+- `reclock.rs:321` — `assert!(prev < RB::before(pid))`
+- `reclock/compat.rs:144` — `assert!(…)` on persist handle state
+- `reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")`
+
+### `src/storage/src/upsert.rs`
+
+- `:541` — `assert!(diff.is_positive(), "invalid upsert input")`
+- `:636` — `panic!("key missing from commands_state")`
+- `:1031` — `unreachable!("pending future never returns")`
+
+### `src/storage/src/upsert_continual_feedback.rs`
+
+- `:626` — `assert!(diff.is_positive(), "invalid upsert input")`
+- `:800` — `panic!("key missing from commands_state")`
+
+### `src/storage/src/upsert_continual_feedback_v2.rs`
+
+- `:315` — `assert!(diff.is_positive(), "invalid upsert input")`
+- `:483` — `unreachable!()` on `(None, None)` from joined prior/new state
+
+### `src/storage/src/upsert/types.rs` — `StateValue` and `ensure_decoded`
+
+- `:297, :369, :403, :416, :430, :440` — six `panic!("called \`<accessor>\` without calling \`ensure_decoded\`")` sites (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`)
+- `:580` — `panic!("\`merge_update_state\` called with non-consolidating state")`
+- `:621` — `assert_eq!(checksum_sum.0, seahash::hash(value) as i64, …)` inside `ensure_decoded` (diff_sum == 1)
+- `:632, :637, :642` — three checks for `diff_sum == 0` (`len_sum`, `checksum_sum`, all-zero `value_xor`)
+- `:672` — `panic!("invalid upsert state: non 0/1 diff_sum: …")`
+- `:1062` — `panic!("attempted completion of already completed upsert snapshot")`
+
+Per the property catalog, each of these gets a *distinct, specific* Antithesis assertion message so a fired assertion names exactly the site reached. No site shares a message with another. See `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, `properties/upsert-ensure-decoded-called-before-access.md`, and `properties/kafka-source-no-internal-panic.md` for the per-site rename table.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
new file mode 100644
index 0000000000000..03f551e5cbd9f
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
@@ -0,0 +1,40 @@
+# kafka-source-frontier-monotonic
+
+## Summary
+
+The `upper` frontier of the source's data persist shard never regresses across the source's lifetime, including across clusterd restarts and `compare_and_append` retries.
+
+## Code paths
+
+- `src/storage/src/render/persist_sink.rs` — `append_batches` calls `WriteHandle::compare_and_append`. Cached upper is the failure-prone spot (commit `505dc96aaa`: cached upper went stale under concurrent writers; fix uses `fetch_recent_upper`).
+- `src/storage/src/source/reclock.rs` — `ReclockOperator::sync`: must not let the operator's `upper` field regress across `compare_and_append` retries.
+- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")`: this is the assertion that catches genuinely invalid persist calls (vs. legitimate `UpperMismatch` which is retried).
+
+## How to check it
+
+- Workload polls `mz_internal.mz_source_statistics_per_worker.offset_committed` (or an equivalent shard-upper view) on a tight cadence and evaluates `assert_always!(upper_monotonic, "kafka: source shard upper non-monotonic")` on every sample, where `upper_monotonic` is `new_sample >= previous_sample`; the assertion fails whenever a new sample is `< previous sample`.
+- SUT-side: in `append_batches`, immediately before `compare_and_append`, capture the previous upper from the local cached state and `assert_always!(new_upper >= prev_upper, "persist sink: upper regression on append")`. Use distinct messages for the corresponding reclock-side assertions.
+
+## What goes wrong on violation
+
+Downstream operators panic when `as_of > upper` (the reclock-`as_of` race in commit `e3805ad790`, database-issues#8698, was exactly this shape). `AS OF` SQL queries return wrong results.
+
+## Antithesis angle
+
+- Kill clusterd mid-`compare_and_append`. On restart, the cached upper must be refreshed before the next append.
+- Concurrent reclock writers (two storage workers racing during a transient split-brain): both attempt CaS; only one wins; the other's local upper must catch up before it tries again.
+- Inject persist consensus latency to widen the cache-staleness window.
+
+## Open question (resolved)
+
+Q: Does the reclock retry loop in `ReclockOperator::mint` (reclock.rs:160-166) protect against this, or is the bug in code that doesn't go through `sync`?
+
+A: The retry loop does protect — but only if `sync()` is called *before* the local upper is used in subsequent code. The historical bug (`e3805ad790`) was in the `as_of` computation path which ran *outside* `mint` and used a cached upper from the read handle. Workload-level monotonicity assertion is sufficient to catch both paths.
+
+## Existing instrumentation
+
+None. The persist-side `panic!("compare_and_append failed: …")` in `reclock/compat.rs:306` is informational, not a property. Wrap with `assert_unreachable!` for the genuinely-invalid case and add an `assert_always!` for the workload-observable monotonicity.
+
+## Provenance
+
+Surfaced by: Data Integrity, Distributed Coordination. Direct regression target for commits `e3805ad790` and `505dc96aaa`.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
new file mode 100644
index 0000000000000..fba0e8348808f
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
@@ -0,0 +1,44 @@
+# kafka-source-no-data-duplication
+
+## Summary
+
+After settling, the source contains no duplicates — at most one row per `(partition, offset)` for NONE-envelope and at most one row per key for UPSERT-envelope.
+
+## Why this property
+
+Duplication is the symmetric failure mode to `kafka-source-no-data-loss`. It is silent, propagates into every downstream aggregate, and historically arose in the upsert operator under multi-replica drain (commit `1accbe28b3`, database-issues#9160). It is the more dangerous of the two failure modes because it is harder to detect operationally — the workload sees "extra" rows that look plausible.
+
+## Code paths
+
+- `src/storage/src/source/kafka.rs:1158` — per-incarnation dedup against `last_offsets` (drops messages with offset `<= last_offset`). Per-incarnation only; does not survive restart.
+- `src/storage/src/render/persist_sink.rs` — the persist sink is responsible for ensuring writes are idempotent across restarts. Compare-and-append with idempotency tokens on retry handles the indeterminate-error case (compare with `idempotent-write-under-indeterminate`).
+- `src/storage/src/upsert_continual_feedback.rs` — `drain_staged_input`: the regression target for commit `1accbe28b3`. Single-replica clusters masked the bug because capabilities were always singletons; multi-replica drained the same staged input twice.
+- `src/storage/src/upsert.rs:541`, `upsert_continual_feedback*.rs` — `assert!(diff.is_positive(), "invalid upsert input")`. Retractions on the input would be the canonical "duplicate retraction" symptom.
+
+## How to check it
+
+Workload-level:
+- NONE envelope: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1, 2 HAVING COUNT(*) > 1` returns 0 rows. Assert with `assert_always!(no_dupes, "kafka source: no duplicate (partition, offset)")`.
+- UPSERT envelope: `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns 0 rows. Same assertion shape with a unique message.
+
+These run on every check fire, ideally on a polling cadence, not just at end-of-test.
+
+SUT-side: convert the existing `assert!(diff.is_positive(), "invalid upsert input")` into `assert_always!(diff.is_positive(), "upsert: input diff positive")` so a duplicate retraction surfaces as a property failure rather than a process abort. Distinct messages at each of the three callsites.
+
+## What goes wrong on violation
+
+Aggregates over the source double-count. Joins fan out. Downstream MVs become wrong in ways that are hard to attribute to ingestion.
+
+## Antithesis angle
+
+- Crash storage worker between `write_batches` and `append_batches`. Restart and verify that no `(partition, offset)` appears twice in the resulting persist shard.
+- For UPSERT: multi-replica cluster topology (the historical bug requires it). Run two replicas on the same source and observe the persisted output for duplicate retractions.
+- Race the upsert feedback-driven snapshot replay against new input.
+
+## Existing instrumentation
+
+The runtime `assert!` in upsert.rs already aborts on non-positive input diffs — it just doesn't surface as an Antithesis property. Wrapping each callsite with `assert_always!` (per-site unique message) gives Antithesis the signal it needs without changing semantics outside Antithesis (the underlying `assert!` already aborts on violation).
+
+## Provenance
+
+Surfaced by: Data Integrity, Concurrency, Failure Recovery. Direct regression target for database-issues#9160.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
new file mode 100644
index 0000000000000..2a451a32d4312
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
@@ -0,0 +1,42 @@
+# kafka-source-no-data-loss
+
+## Summary
+
+Every Kafka message produced by the workload is eventually visible in the source — either as a row (NONE envelope) or as the latest value for its key (UPSERT envelope).
+
+## Why this property
+
+This is the headline guarantee of a streaming database. The previous catalog entry `source-ingestion-progress` covered the generic "frontier advances" liveness signal; this property is the Kafka-specific, workload-checkable version that compares produced records against `SELECT` output.
+
+## Code paths
+
+- `src/storage/src/source/kafka.rs` — `render_reader`: the reader loop that drains `PartitionQueue`s, deduplicates against `last_offsets`, and emits `(SourceMessage, KafkaTimestamp, +1)` triples.
+- `src/storage/src/source/source_reader_pipeline.rs` — `create_raw_source`: assembles reader, remap, reclock.
+- `src/storage/src/source/reclock.rs` — `ReclockOperator::mint`: binds source timestamps to Materialize timestamps and persists the binding via `compare_and_append` on the remap shard.
+- `src/storage/src/render/persist_sink.rs` — `mint_batch_descriptions` → `write_batches` → `append_batches`: the path that actually puts rows into the source's data persist shard.
+- For UPSERT: `src/storage/src/upsert.rs` (`upsert_classic`) and the continual-feedback variants in `upsert_continual_feedback*.rs`.
+
+## How to check it
+
+Workload-level:
+1. The workload tracks every `(topic, partition, offset, key, value)` it produces.
+2. After produce settles, the workload calls `ANTITHESIS_STOP_FAULTS` and waits for `mz_internal.mz_source_statistics_per_worker` to report `offset_committed >= max_produced_offset`.
+3. The workload asserts via `assert_sometimes!(expected_rowcount_visible, "kafka source caught up to produced offsets")` that `SELECT COUNT(*) FROM source` returns at least `produced_count` (NONE) or that the per-key latest-value model matches the source (UPSERT).
+
+SUT-side anchor: `assert_sometimes!(persist_sink_appended_batch)` inside `append_batches` after the first successful `compare_and_append` for this source.
+
+## What goes wrong on violation
+
+Silent data loss: the source ingests fewer rows than were produced; the workload sees a stall that doesn't resolve even with faults paused. Downstream MVs see incomplete data.
+
+## Antithesis angle
+
+The interesting window is mid-batch crash: a clusterd kill between the persist sink's `write_batches` (which uploads parts) and `append_batches` (which compare-and-appends). The resume frontier on restart determines what gets re-read. Bugs here look like a wrong resume offset: the dedup at kafka.rs:1158 is per-incarnation only, so across a restart idempotency depends entirely on persist-sink correctness.
+
+## Existing instrumentation
+
+None. No `assert_sometimes!` in the source path today (verified against `existing-assertions.md`). To implement: add an `assert_sometimes!` in the persist sink's `append_batches` after a successful append, plus a workload-side `assert_sometimes!` after the quiet-period catch-up check.
+
+## Provenance
+
+Surfaced by: Data Integrity, Failure Recovery, Product Context.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md
new file mode 100644
index 0000000000000..6f6106aedbcce
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md
@@ -0,0 +1,44 @@
+# kafka-source-no-internal-panic
+
+## Summary
+
+The explicit panics and `assert!`s in the Kafka source reader never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged Antithesis assertion so a firing is a reportable property failure rather than a clusterd crash.
+
+## Targeted sites
+
+`src/storage/src/source/kafka.rs`:
+
+| Line | Site | Antithesis form |
+|------|------|------------------|
+| 276 | `panic!("unexpected source export details: {:?}", details)` | `assert_unreachable!("kafka: unexpected source export details")` |
+| 891 | `assert!(reader.partition_consumers.is_empty())` | `assert_always!(reader.partition_consumers.is_empty(), "kafka: partition_consumers not drained at shutdown")` |
+| 1142 | `assert!(self.last_offsets.get(output_index).unwrap().contains_key(&partition))` | `assert_always!(…, "kafka: partition missing from last_offsets")` |
+| 1193 | `panic!("got negative offset ({}) from otherwise non-error'd kafka message", msg.offset())` | `assert_unreachable!("kafka: negative offset from non-error message")` |
+| 1457 | `assert!(…)` (debug-mode payload validation) | `assert_always!(…, "kafka: payload check")` |
+
+Plus the cluster of `expect()` sites that are structurally similar — resume-upper missing (265), statistics not initialized (282), restored offset out of `i64` range (345), `position()` failure (606), `partition_known` lookup (853, 855), offset arithmetic (997, 1055, 1060, 1063, 1072, 1082), watermark not negative (1492). These are lower-priority, but mass-converting their failure branches to `assert_unreachable!(...)` is cheap.
+
+## Why these sites matter
+
+- The "negative offset" panic at 1193 is the most interesting: rdkafka has shipped negative offsets in the past under certain protocol bugs, and an `i64` cast that wraps silently would be worse than the panic. Antithesis can reach this through manual broker-state manipulation in the workload.
+- The capability-downgrade assertion family (relevant to commit `99ad668af5`'s topic-recreation panic) — currently that code path *logs and continues* rather than panicking, but if a future refactor reintroduces a `panic!` on offset regression, this property catches it.
+- The `partition_consumers.is_empty()` assertion at 891 catches a shutdown-ordering bug that would manifest as a clusterd crash on source drop.
+
+## Antithesis angle
+
+- Topic deletion + recreation on the Kafka container. Specifically: drop a topic with offsets `[0..1000]`, recreate it with offsets `[0..100]` (a lower high watermark). The source's resume frontier sees `last_offset = 1000` and rdkafka delivers offset `100`. The dedup at kafka.rs:1158 handles this; the assertion at 1142 catches the case where the *partition itself* is missing from the dedup table.
+- Partition rebalance: increase Kafka topic partition count from the broker side mid-run. The metadata fetcher must discover and assign the new partitions correctly.
+- Manual offset reset: most relevant for the negative-offset panic at 1193.
+- Clock jumps: Kafka's internal timestamp arithmetic uses millisecond offsets; clock jitter has historically interacted poorly with the `expect("kafka sources always have upstream_time")` at line 1209.
+
+## Existing instrumentation
+
+The panics and asserts already exist. They currently abort clusterd. The work is wrapping each site with the Antithesis SDK so the abort becomes a reportable, replayable property failure. Each site uses a distinct message naming exactly the invariant violated.
+
+## Relationship to other properties
+
+This is the SUT-side counterpart to the workload-level `kafka-source-no-data-loss` and `kafka-source-no-data-duplication`. A workload-level row-count mismatch tells you data is wrong; a fired SUT-side assertion tells you *where* it went wrong.
+
+## Provenance
+
+Surfaced by: Failure Recovery, External Dependencies. Regression targets: commits `99ad668af5`, `3e32df1f69`.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md
new file mode 100644
index 0000000000000..fd05df6b47e70
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md
@@ -0,0 +1,40 @@
+# kafka-source-survives-broker-fault
+
+## Summary
+
+After a network partition or Kafka outage that prevents the source from making progress, once connectivity is restored the source resumes ingestion and eventually ingests every message produced during the outage.
+
+## Code paths
+
+- `src/storage/src/source/kafka.rs` — `render_reader` polls per-partition `PartitionQueue`s. rdkafka's internal reconnect logic handles broker reconnect; the storage reader must not enter a permanent stall state when the consumer errors out.
+- `src/storage/src/healthcheck.rs` — the source's `HealthStatusUpdate` transitions: `Running` → `Stalled { hint }` during the outage → back to `Running` after recovery. `Ceased` would be a violation (terminal failure for a transient fault).
+- `src/storage/src/statistics.rs` — `offset_known` and `offset_committed` resume advancing post-recovery. The rehydration-latency reset (commit `0a34b6c79d`) is relevant if the reconnect goes through a dataflow restart.
+
+## How to check it
+
+Workload procedure:
+1. Produce N messages.
+2. Inject a network partition between the `materialized` container and the Kafka container. The partition isolates only that pair; persist/metadata remain reachable.
+3. Produce N more messages while the partition is active.
+4. Heal the partition (Antithesis fault scheduler) and call `ANTITHESIS_STOP_FAULTS`.
+5. Poll `mz_internal.mz_source_statistics_per_worker.offset_committed` until `offset_committed >= max_produced_offset`. Bound the poll loop with a generous timeout.
+6. `assert_sometimes!(source_resumed_after_broker_fault, "kafka source resumed after Kafka container partition")`.
+
+## What goes wrong on violation
+
+The source enters a permanent stall: rdkafka thinks it's reconnected but the reader never re-reads; or the operator transitions to `Ceased` and the source must be manually dropped/recreated.
+
+## Antithesis angle
+
+- Bidirectional network partition: `materialized` ↔ Kafka.
+- Asymmetric partition: outbound packets to Kafka dropped but inbound responses allowed (or vice versa). rdkafka may not detect this and may sit waiting for a response forever.
+- Repeated short partitions: stress reconnect cadence.
+- Kafka container hang (CPU throttling to zero rather than network partition).
+
+## Existing instrumentation
+
+None. Workload-level `assert_sometimes!` is the entry point. Optional SUT-side: `assert_sometimes!(kafka_consumer_reconnected, ...)` inside the reader after rdkafka reports a successful reconnect.
+
+## Provenance
+
+Surfaced by: Failure Recovery, External Dependencies.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md
new file mode 100644
index 0000000000000..072f374048ae6
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md
@@ -0,0 +1,47 @@
+# kafka-source-survives-clusterd-restart
+
+## Summary
+
+After clusterd is killed and restarted, the Kafka source recovers its state, computes the correct resume offsets, and ingests messages produced before, during, and after the restart.
+
+## Code paths
+
+- `src/storage-client/src/controller.rs` — the storage controller's command-replay logic; this is the entry point for the `storage-command-replay-idempotent` property cluster.
+- `src/storage/src/storage_state.rs` — `RunIngestionCommand` handling. The async storage worker serializes ingestion vs. compaction (commit `3e5259782c`).
+- `src/storage/src/source/source_reader_pipeline.rs:481-493` — remap operator bootstraps by loading the entire initial batch from the remap shard before resuming new mints.
+- `src/storage/src/source/kafka.rs:346-349` — `start_offsets` derived from persisted resume frontier.
+- For UPSERT: `src/storage/src/upsert.rs` and `upsert_continual_feedback*.rs` — state reconstruction via the feedback stream (drain all values at or below resume frontier, then transition to normal mint mode).
+
+## How to check it
+
+Workload procedure:
+1. Produce N messages; wait for source to ingest them.
+2. Kill clusterd via Antithesis node-termination fault.
+3. Produce M more messages while clusterd is down.
+4. Wait for restart, call `ANTITHESIS_STOP_FAULTS`.
+5. Poll until `offset_committed >= max_produced_offset`.
+6. `assert_sometimes!(clusterd_restart_recovered, "kafka source recovered after clusterd kill")`. Combine with `kafka-source-no-data-duplication` to rule out double-counting; combine with `kafka-source-no-data-loss` to rule out gaps.
+
+## What goes wrong on violation
+
+- Resume offset is wrong (too low → duplicates; too high → gap).
+- UPSERT state is wrong (stale value per key, or missing keys).
+- Source never recovers because remap-shard bootstrap fails.
+
+## Antithesis angle
+
+The most interesting timing is a kill *between* the persist sink's `compare_and_append` returning success and the controller's frontier-report channel actually delivering the new frontier upstream. The source on restart must compute its resume frontier from the durably-recorded shard upper, not from any cached or in-flight state.
+
+For UPSERT specifically: kill during the snapshot phase. The feedback-driven snapshot must restart cleanly and complete with the same final state.
+
+## Dependency
+
+Requires **node-termination faults** to be enabled in the Antithesis tenant. Confirm with the user. Without this fault, the property is vacuous.
+
+## Existing instrumentation
+
+None. Workload-level assertion only, until SUT-side rehydration anchors are added. Candidate SUT anchors: `assert_sometimes!(snapshot_phase_completed, …)` in the upsert operator's snapshot-completion path, and `assert_sometimes!(remap_bootstrap_complete, …)` in `source_reader_pipeline.rs:481`.
+
+## Provenance
+
+Surfaced by: Failure Recovery. Builds on `storage-command-replay-idempotent` and `fault-recovery-exercised`.
diff --git a/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md
new file mode 100644
index 0000000000000..7b1d830ee91d6
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md
@@ -0,0 +1,39 @@
+# offset-known-not-below-committed
+
+## Summary
+
+For every Kafka source, the statistics view always reports `offset_known >= offset_committed`. Causally, what the broker has told us is available cannot lag what Materialize has durably ingested.
+
+## Code
+
+- `src/storage/src/statistics.rs` (around line 56-71) — the statistics update path that previously allowed regression. Commit `3e32df1f69` introduced clamping so that on a restart where `offset_known` would be loaded from the broker watermark while `offset_committed` is restored from persist, the metric does not flip into the wrong order.
+
+## How to check it
+
+Workload-side polling:
+
+```sql
+SELECT id, offset_known, offset_committed
+FROM mz_internal.mz_source_statistics_per_worker
+WHERE id = ?
+```
+
+`assert_always!(offset_known >= offset_committed, "kafka source statistics: offset_known < offset_committed")`.
+
+SUT-side: mirror as an `assert_always!` inside the statistics update path itself, immediately after both fields are computed but before the value is published.
+
+## What goes wrong on violation
+
+The lag metric `offset_known - offset_committed` becomes a small negative number that wraps to a huge positive number in dashboards (commonly displayed as `u64` or with `MAX(0, …)` clamping that hides the actual bug). Operational tooling that drives autoscaling or alerting off lag becomes unreliable.
+
+## Antithesis angle
+
+The most interesting timing is the very first sample after a clusterd restart. The order in which the source restores `offset_committed` (from the persist shard upper) and learns `offset_known` (from rdkafka's first metadata response) determines whether the invariant holds during the window where one is set and the other is zero. The fix in commit `3e32df1f69` clamps; Antithesis should verify the clamp covers every interleaving.
+
+## Existing instrumentation
+
+None. Pure workload-side polling assertion, optionally mirrored SUT-side.
+
+## Provenance
+
+Surfaced by: Data Integrity (metrics correctness). Direct regression target for commit `3e32df1f69`.
diff --git a/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md
new file mode 100644
index 0000000000000..ee2fb633240e4
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md
@@ -0,0 +1,61 @@
+# reclock-mint-eventually-succeeds
+
+## Summary
+
+Under transient persist outages and competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, `src/storage/src/source/reclock.rs:160-166`) eventually completes for every source-frontier advance that has data to bind.
+
+## Code
+
+```rust
+// src/storage/src/source/reclock.rs (around line 150-170)
+loop {
+    match handle.compare_and_append(updates, prev_upper, new_into_upper).await {
+        Ok(()) => break,
+        Err(UpperMismatch { current, .. }) => {
+            self.sync(&current).await;
+            // recompute updates and retry
+        }
+    }
+}
+```
+
+There is no upper bound on this loop. It depends on the persist backend eventually being responsive and on competing writers not livelocking the source.
+
+## Why this is a liveness property
+
+Antithesis's job is to assert that the loop terminates in adversarial schedules. The catalog entry asserts both:
+
+1. The retry path is *exercised* (the loop runs more than once at least once during a run): `Sometimes(saw_cas_retry)`.
+2. The source frontier eventually advances past the contention point: a workload-observable liveness check.
+
+## How to check it
+
+SUT-side anchor:
+- Add an `assert_sometimes!(reclock_cas_retry_succeeded, "reclock: mint compare_and_append retry succeeded")` immediately after a successful `compare_and_append` that was preceded by at least one `UpperMismatch`. Track this with a local retry counter that is reset on each `mint()` invocation.
+
+Workload-side liveness check:
+- After injecting persist consensus latency or a competing-writer scenario, observe the source's `offset_committed` advancing in `mz_internal.mz_source_statistics_per_worker`. `assert_sometimes!(source_advanced_post_contention, …)`.
+
+## What goes wrong on violation
+
+The source's frontier stops advancing without any external signal that something is wrong. Health reports `Running`. The reclock operator is in an infinite `compare_and_append` → `UpperMismatch` → `sync` → `compare_and_append` cycle. To a human operator looking from outside, it looks like Kafka is the problem.
+
+## Antithesis angle
+
+- Inject high persist consensus latency. With many concurrent storage workers (or restart-induced competing writers), the CaS contention rate climbs and the retry loop runs many times. Antithesis tests that progress still happens.
+- Race the metadata fetcher's partition-add against an in-flight mint. The mint is now reckoning with an extended `source_upper`; the CaS retry must recompute updates correctly.
+- Concurrent kill+restart cycles that create competing-writer scenarios.
+
+## Open question (resolved)
+
+Q: Is there any input under which `compare_and_append` returns a non-retryable error and the loop should exit?
+
+A: Yes — `InvalidUsage` errors (handled by `panic!("compare_and_append failed: {invalid_use}")` at `reclock/compat.rs:306`). Those terminate the source. The retry loop only handles `UpperMismatch`. Antithesis fault injection should not produce `InvalidUsage` under correct code; if it does, that is a separate property (`reclock-cas-no-invalid-usage`) but it falls under the broader `kafka-source-no-internal-panic` property already cataloged.
+
+## Existing instrumentation
+
+None. The retry loop is silent.
+
+## Provenance
+
+Surfaced by: Failure Recovery, Distributed Coordination.
diff --git a/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md
new file mode 100644
index 0000000000000..75a17a7446664
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md
@@ -0,0 +1,55 @@
+# remap-shard-antichain-wellformed
+
+## Summary
+
+At every Materialize timestamp `t`, the contents of the source's remap shard accumulated to `t` form a well-formed `Antichain<KafkaTimestamp>`. Each source-time element has multiplicity exactly 1; for multi-partition Kafka sources, there is one element per partition range with no overlaps.
+
+## Origin
+
+This invariant is stated explicitly in the `ReclockOperator` doc comment (`src/storage/src/source/reclock.rs:31-34`):
+
+> "The `ReclockOperator` will always maintain the invariant that for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`. In other words the remap collection describes a well formed `Antichain<FromTime>` as it is marching forwards."
+
+## Code paths
+
+- `src/storage/src/source/reclock.rs:118-169` — `ReclockOperator::mint`. Each call:
+  1. Emits retractions (`-1`) of the prior `source_upper`.
+  2. Emits insertions (`+1`) of the new `source_upper`.
+  3. Calls `compare_and_append` on the remap shard.
+  4. On `UpperMismatch`, `sync()` and retry.
+- `src/storage/src/source/reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))` guards the mint precondition.
+- `src/storage/src/source/reclock.rs:321` — `assert!(prev < RB::before(pid))` guards the partition-range ordering.
+- `src/storage/src/source/reclock/compat.rs:144` — `assert!` on persist handle state.
+- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")` for genuinely invalid CaS calls.
+
+## Antithesis form
+
+Two complementary checks:
+
+1. **SUT-side** inside `ReclockOperator::sync` / `mint`, after every update: walk the local accumulated state and `assert_always!(antichain_wellformed, "reclock: remap shard accumulates to well-formed antichain")` — every source-time element has multiplicity 1. This is the tightest expression of the invariant.
+
+2. **Workload-side** as a periodic SQL probe: select the remap shard's contents (via `mz_internal` introspection views if available) and verify the well-formed property externally. This catches the case where the SUT-side check is correct but the durable persist state diverges.
+
+## What goes wrong on violation
+
+A malformed remap antichain corrupts every subsequent restart's resume frontier. The source either skips data (resume frontier too far ahead), re-reads data (too far back), or panics in downstream operators that depend on well-formed antichains (e.g., the as_of computation in commit `e3805ad790`).
+
+## Antithesis angle
+
+- Concurrent reclock writers across restart: kill the storage worker mid-mint, restart, the new worker must `sync()` the durable state and re-mint from there. If `sync()` is wrong, the new worker may insert without retracting, breaking multiplicity.
+- Partition adds/removes interleaved with mints: the partition-range encoding in `RangeBound<PartitionId>` is the part that has to stay consistent across discovery and binding.
+- `compare_and_append` retry loop interactions: the retry loop at reclock.rs:160-166 itself retried correctly, but the historical cached-upper-drift bug (commit `e3805ad790`) bypassed it.
+
+## Open question (resolved)
+
+Q: Can the in-memory `source_upper` and the persisted remap state ever diverge enough that the operator emits a malformed update batch?
+
+A: The `MutableAntichain<FromTime>` in `ReclockOperator::source_upper` is the source of truth for what *should* be persisted next. `mint()` constructs the update batch by diffing the new desired upper against the current `source_upper`. The retraction-insertion structure is what preserves the antichain-multiplicity invariant. The only divergence path is if `sync()` after `UpperMismatch` reads a state inconsistent with what `source_upper` thinks — i.e., a true persist corruption. The assertion at compat.rs:144 is meant to catch this.
+
+## Existing instrumentation
+
+The `assert!` and `panic!` calls at reclock.rs:124, :321 and compat.rs:144, :306 exist. None of them check the *accumulated antichain* property directly — they check local invariants. The recommended new assertion is an `assert_always!` over the in-memory accumulator that runs at every state transition.
+
+## Provenance
+
+Surfaced by: Data Integrity, Distributed Coordination. Foundational invariant for the entire reclocking subsystem.
diff --git a/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md
new file mode 100644
index 0000000000000..850914b374346
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md
@@ -0,0 +1,42 @@
+# upsert-decode-error-retractable
+
+## Summary
+
+An `UpsertError` (key decode failure, null key, or value decode failure) for a key is retracted once a subsequent valid `(key, value)` message for the same key is ingested. After settling, the source reflects the corrected value and contains no remaining error row for that key.
+
+This is the upsert envelope's recovery contract for upstream schema mistakes — "fix the bad message and continue" without dropping the source.
+
+## Code paths
+
+- `src/storage/src/render/sources.rs` — `upsert_commands` (line ~509-560 and following): maps decode failures to `UpsertError::NullKey` / `KeyDecode` / `Value`. The result still flows through the upsert pipeline keyed by `UpsertKey::from_key(Err(&err))` so a future good value can retract it.
+- `src/storage-types/src/errors.rs:161-199` — `EnvelopeError::Upsert(UpsertError)` is the *retractable* error variant. `EnvelopeError::Flat(text)` is explicitly *not retractable*.
+- `src/storage/src/upsert.rs:748-750` — error emission paths.
+
+## How to check it
+
+Workload procedure:
+1. Produce a malformed message for key `K` (e.g., invalid Avro under a schema-registry-backed source, or null key on a non-null-key source).
+2. Verify the source contains an error row keyed by `K`.
+3. Produce a valid `(K, value)` message.
+4. After a quiet period, `assert_always!(upsert_error_retracted, "upsert: bad value retracted by subsequent good value")` checking that `SELECT * FROM source WHERE key = K` returns exactly one row with `value` and no error row.
+
+## What goes wrong on violation
+
+If the error is not retractable, the source carries a stuck error row that nothing can clear — the only recovery is to drop and re-create the source.
+
+## Distinguishing retractable from non-retractable
+
+This property targets `EnvelopeError::Upsert(_)` only. `EnvelopeError::Flat(_)` is explicitly non-retractable and should not be tested with this property. Workloads must take care to produce errors that map to the Upsert variant — null key, malformed key/value under upsert mode — rather than envelope-fatal errors.
+
+## Antithesis angle
+
+- Race the bad and good messages closely. Verify ordering is preserved.
+- Crash clusterd between the bad message ingesting and the good message ingesting. The error row must persist across the restart and the good message must retract it on resume.
+
+## Existing instrumentation
+
+None. Workload-side check.
+
+## Provenance
+
+Surfaced by: Protocol Contracts, Failure Recovery.
diff --git a/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md
new file mode 100644
index 0000000000000..244fb4a4ed01d
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md
@@ -0,0 +1,43 @@
+# upsert-ensure-decoded-called-before-access
+
+## Summary
+
+The six `StateValue` accessors that require the cell to be in `Value` form are always called after `ensure_decoded` has been called on that cell — the panics that currently guard the type-state protocol never fire.
+
+## Targeted sites
+
+`src/storage/src/upsert/types.rs`:
+
+| Line | Accessor | Message |
+|------|----------|---------|
+| 297 | `into_decoded` | `panic!("called \`into_decoded without calling \`ensure_decoded\`")` |
+| 369 | `into_provisional_value` | `panic!("called \`into_provisional_value\` without calling \`ensure_decoded\`")` |
+| 403 | `into_provisional_tombstone` | `panic!("called \`into_provisional_tombstone\` without calling \`ensure_decoded\`")` |
+| 416 | `provisional_order` | `panic!("called \`provisional_order\` without calling \`ensure_decoded\`")` |
+| 430 | `provisional_value_ref` | `panic!("called \`provisional_value_ref\` without calling \`ensure_decoded\`")` |
+| 440 | `into_finalized_value` | `panic!("called \`into_finalized_value\` without calling \`ensure_decoded\`")` |
+
+Each becomes `assert_unreachable!("upsert: <accessor> on Consolidating StateValue")` with a distinct, accessor-specific message.
+
+## Why this is a real property, not just dead code
+
+Two reasons.
+
+1. **Refactor net.** The upsert operator has been rewritten twice (`upsert_classic`, `upsert_continual_feedback`, `upsert_continual_feedback_v2`). Every rewrite added new call sites that touch `StateValue`. A future refactor that forgets to call `ensure_decoded` would today abort clusterd; with the Antithesis SDK in place, it surfaces as a property failure during the very first nightly run after the change.
+2. **Replay anchors.** If Antithesis ever does trip one of these, the failure pinpoints the exact accessor and code path. That is materially more useful than a stack trace from a process abort, especially in a multi-replica scenario where the abort is invisible behind clusterd's auto-restart.
+
+## What this property does *not* catch
+
+This property only checks the type-state protocol — "ensure_decoded was called first." It does not check that the consolidating math itself is correct (that is `upsert-state-consolidation-wellformed`). The two are complementary.
+
+## Antithesis angle
+
+These panics are most likely to fire after a code change to the upsert operator's hot path. Antithesis exercises every operator branch with random fault injection — it should reach the rewrite-sensitive accessor sites if any exist. Cost of instrumenting is trivial (rename `panic!` to `assert_unreachable!`); the value is the regression net.
+
+## Existing instrumentation
+
+The `panic!`s already exist. They abort the process on misuse. The work is wrapping each with `assert_unreachable!` so the misuse is reported.
+
+## Provenance
+
+Surfaced by: Wildcard (this is the type-state guard family that doesn't fit a standard focus).
diff --git a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
new file mode 100644
index 0000000000000..90341358df926
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
@@ -0,0 +1,63 @@
+# upsert-key-reflects-latest-value
+
+## Summary
+
+At a settled timestamp, every key in an UPSERT-envelope source maps to the value from the last `(key, value)` message produced — or to no row if the last message for that key was a tombstone.
+
+## Code paths
+
+- `src/storage/src/render/sources.rs` — `upsert_commands` converts `DecodeResult` into `(UpsertKey, Option<UpsertValue>, FromTime)`. `UpsertKey` is a SHA-256 of the key bytes (collision probability `2^-128`).
+- `src/storage/src/upsert.rs` — `upsert_classic`: the main operator. For each input update at `from_time`:
+  1. `multi_get(key)` → returns prior value + prior order key.
+  2. Skip if `from_time <= prior_order` (stale update).
+  3. Emit retraction of prior value at the new timestamp.
+  4. Emit insertion of new value at the new timestamp.
+  5. `multi_put(key, new_value)` updates the state store.
+- `src/storage/src/upsert_continual_feedback.rs` and `_v2.rs` — alternative implementations driven by persist feedback. Same contract, different consolidation strategy.
+- `src/storage/src/upsert/types.rs` — `StateValue::ensure_decoded` (~line 589) finalizes the XOR-checksum consolidating state into either a `Value` or a `tombstone`. Critical for snapshot replay correctness.
+
+## How to check it
+
+Workload-level:
+1. Workload tracks `expected_state: Map<Key, Option<Value>>` of what was last produced per key.
+2. After fault quiet period, for a sampled set of keys: `SELECT value FROM source WHERE key = ?` and compare to `expected_state[key]`.
+3. `assert_always!(upsert_value_matches_latest_produced, "upsert: key value matches latest produced")` — checked on every sample. If the workload notices a divergence, it logs the diff (expected vs. observed) for replay.
+
+## What goes wrong on violation
+
+The source returns a stale value for a key. The user's downstream MV uses it. The bug is invisible until someone manually compares the source to the upstream system.
+
+## Antithesis angle
+
+- Crash clusterd between `multi_get` and `multi_put`. The next incarnation must reconstruct state correctly from feedback.
+- Race produce ordering: if Kafka delivers `(k, v1)` then `(k, v2)`, the source's order-key tracking must serialize them. Order-key regression caused a historical panic (commit `f177db8286`, materialize#26655).
+- For RocksDB backend: race `multi_put` against the merge operator running async.
+- For multi-replica: both replicas process the same key concurrently (commit `1accbe28b3`).
+
+## Open question (resolved)
+
+Q: Does the workload need to know about the per-source `order_key` to validate, or is `from_time` ordering sufficient?
+
+A: To assert correctness at quiet periods, the workload only needs the *Kafka* produce order — the operator's job is to translate that into the correct visible value. Since Antithesis injects faults but doesn't reorder Kafka's per-partition delivery, the workload can rely on per-partition produce order to determine `expected_state`. Cross-partition reordering is not a concern because the workload assigns each key to a fixed partition.
+
+## Existing instrumentation
+
+None. Pure workload-side check. Optional SUT anchor: an `assert_sometimes!(upsert_emit_correct_retraction, …)` inside `upsert.rs` after a retraction is emitted whose prior value matched what was stored — this gives Antithesis a positive signal that the prior-value-lookup path is being exercised.
+
+## Implementation status
+
+Implemented 2026-05-11 as `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Three assertion messages, each unique:
+
+| Message | Type | When |
+|---------|------|------|
+| `"upsert: SELECT for key matches latest produced value"` | `always` | Per sampled live key after quiet-period catchup |
+| `"upsert: tombstoned key has no row in source"` | `always` | Per sampled key whose last produced message was a tombstone |
+| `"upsert: source caught up to produced offsets after quiet period"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data |
+
+Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_quiet.py` (`ANTITHESIS_STOP_FAULTS` wrapper), `helper_random.py` (deterministic randomness with Antithesis SDK), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`).
+
+No SUT-side instrumentation added in this pass — that is the candidate work in `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, and `properties/upsert-ensure-decoded-called-before-access.md`.
+
+## Provenance
+
+Surfaced by: Data Integrity, Concurrency. Direct regression target for materialize#26655.
diff --git a/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md
new file mode 100644
index 0000000000000..e9d097626e601
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md
@@ -0,0 +1,43 @@
+# upsert-no-internal-panic
+
+## Summary
+
+The upsert operator's explicit `assert!`s and `panic!`s — currently process-aborting guards — never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged `assert_always!` / `assert_unreachable!` so a firing surfaces as a reportable Antithesis property failure rather than a clusterd crash.
+
+## Targeted assertion sites
+
+| File | Line | Site | Antithesis form |
+|------|------|------|------------------|
+| `src/storage/src/upsert.rs` | 541 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (classic)")` |
+| `src/storage/src/upsert.rs` | 636 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (classic)")` |
+| `src/storage/src/upsert.rs` | 1031 | `unreachable!("pending future never returns")` | `assert_unreachable!("upsert: pending future returned (classic)")` |
+| `src/storage/src/upsert_continual_feedback.rs` | 626 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v1)")` |
+| `src/storage/src/upsert_continual_feedback.rs` | 800 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (cf v1)")` |
+| `src/storage/src/upsert_continual_feedback_v2.rs` | 315 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v2)")` |
+| `src/storage/src/upsert_continual_feedback_v2.rs` | 483 | `unreachable!()` on `(None, None)` from joined prior/new state | `assert_unreachable!("upsert: cf v2 join produced (None, None)")` |
+| `src/storage/src/upsert/types.rs` | 580 | `panic!("merge_update_state called with non-consolidating state")` | `assert_unreachable!("upsert: merge_update_state on non-Consolidating state")` |
+| `src/storage/src/upsert/types.rs` | 1062 | `panic!("attempted completion of already completed upsert snapshot")` | `assert_unreachable!("upsert: snapshot completion called twice")` |
+
+Each message is unique; an Antithesis failure report names exactly the site that was reached.
+
+## Why these sites
+
+These are structural invariants the operator's authors believed to be impossible. Bug history confirms several have fired in production (commits `f177db8286`, `1accbe28b3`). The cost of wrapping them with the Antithesis SDK is trivial; the upside is reportable, replayable property failures.
+
+## Antithesis angle
+
+- Multi-replica clusters: most relevant for `key missing from commands_state` and the `unreachable!` on `(None, None)`.
+- Order-key edge cases: maps to the `assert!(diff.is_positive())` family.
+- Snapshot completion: the `panic!("attempted completion of already completed upsert snapshot")` is reached if the snapshot-completion state machine is re-entered (rehydration after a crash that already completed snapshot).
+
+## Relationship to other properties
+
+This property is the *operator-internal* counterpart to `upsert-state-consolidation-wellformed` (which guards the math in `ensure_decoded`) and `upsert-ensure-decoded-called-before-access` (which guards the type-state protocol on `StateValue` accessors). Together they form the SUT-side instrumentation backbone for the UPSERT envelope.
+
+## Existing instrumentation
+
+The `assert!` / `panic!` calls already exist as process-aborting guards. They abort in test today; the work is converting them to `assert_always!`/`assert_unreachable!` so failures are *reported* rather than masked as "clusterd was restarted." Each site gets a distinct, specific message per the property-catalog requirement that assertion messages be unique.
+
+## Provenance
+
+Surfaced by: Concurrency, Failure Recovery. Regression targets: commits `f177db8286`, `1accbe28b3`, materialize#26655, database-issues#9160.
diff --git a/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md
new file mode 100644
index 0000000000000..d65161bba6766
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md
@@ -0,0 +1,75 @@
+# upsert-state-consolidation-wellformed
+
+## Summary
+
+`StateValue::ensure_decoded` always finalizes a `Consolidating` cell into either a `Value(value)` (when `diff_sum == 1` and the recovered bytes match the stored `len_sum` and seahash `checksum_sum`) or a `tombstone()` (when `diff_sum == 0` and the entire accumulator is zero). Any other state — non-{0,1} `diff_sum`, mismatched checksum, non-zero residue on a tombstone — is an XOR/accounting corruption and must never be observed.
+
+## Code
+
+`src/storage/src/upsert/types.rs:584-682`:
+
+```rust
+pub fn ensure_decoded(&mut self, bincode_opts, source_id, key) {
+    match self {
+        StateValue::Consolidating(consolidating) => {
+            match consolidating.diff_sum.0 {
+                1 => {
+                    let len = usize::try_from(consolidating.len_sum.0)...expect(...);
+                    let value = &consolidating.value_xor.get(..len)...expect(...);
+                    assert_eq!(consolidating.checksum_sum.0, seahash::hash(value) as i64, ...);
+                    *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap());
+                }
+                0 => {
+                    assert_eq!(consolidating.len_sum.0, 0, ...);
+                    assert_eq!(consolidating.checksum_sum.0, 0, ...);
+                    assert!(consolidating.value_xor.iter().all(|&x| x == 0), ...);
+                    *self = Self::tombstone();
+                }
+                other => panic!("invalid upsert state: non 0/1 diff_sum: {other}, ..."),
+            }
+        }
+        StateValue::Value(_) => {}
+    }
+}
+```
+
+## Antithesis form
+
+Each of the four assertions in this function, plus the trailing `panic!`, becomes a uniquely-messaged `assert_always!`:
+
+| Existing | Antithesis form | Message |
+|---|---|---|
+| `assert_eq!(checksum_sum, seahash::hash(value))` (621) | `assert_always!(checksum_sum == seahash::hash(value), …)` | `"upsert: consolidating checksum_sum mismatch (diff_sum=1)"` |
+| `assert_eq!(len_sum, 0)` (632) | `assert_always!(len_sum == 0, …)` | `"upsert: consolidating len_sum nonzero (diff_sum=0)"` |
+| `assert_eq!(checksum_sum, 0)` (637) | `assert_always!(checksum_sum == 0, …)` | `"upsert: consolidating checksum_sum nonzero (diff_sum=0)"` |
+| `assert!(value_xor.iter().all(==0))` (642) | `assert_always!(value_xor.iter().all(==0), …)` | `"upsert: consolidating value_xor nonzero (diff_sum=0)"` |
+| `panic!("invalid upsert state: non 0/1 diff_sum: {other}, …")` (672) | `assert_always!(false, …)` | `"upsert: consolidating diff_sum not in {0,1}"` |
+
+Plus the two `expect("invalid upsert state")` calls at 606 and 619 (the `len_sum`-to-`usize` conversion and the `value_xor` slice, respectively); these should become `assert_always!`s (e.g. `assert_always!(value_xor.len() >= len, …)` for the slice), each with a distinct message.
+
+## What goes wrong on violation
+
+The XOR-based consolidation collapses many `(diff, bytes)` updates per key into a single accumulator. The math only works if every retraction is exactly paired with its insertion. A trip into the non-{0,1} branch indicates one of:
+
+- A duplicate retraction (commit `1accbe28b3` style multi-replica double-drain).
+- A retraction without a matching insertion in the replay stream (incomplete feedback delivery across crash).
+- A `seahash` collision (negligible probability — if seen, it's a bug elsewhere, not the hash).
+- A bug in the `merge_update_state` math (`upsert/types.rs:533+`).
+
+## Antithesis angle
+
+- Kill clusterd mid-feedback-replay; restart and assert that `ensure_decoded` always completes cleanly.
+- Multi-replica with concurrent drains feeding the same RocksDB backend.
+- Race RocksDB's async merge operator against `multi_put`.
+
+## Why this is the deepest signal
+
+The XOR/checksum consolidation is the *math*: if this assertion ever trips, something upstream — feedback delivery, retraction emission, or order-key tracking — produced an inconsistent update sequence. The signal is high because the assertion is at the *bottom* of the pipeline; everything else has had a chance to introduce the bug, but only this site can detect it.
+
+## Existing instrumentation
+
+The runtime `panic!` and `assert!`s already exist and would abort clusterd on violation. Today, an abort in test looks like "the storage worker crashed" — possibly retried, possibly noticed only via a log scrape. Wrapping them with Antithesis assertions turns each into a reportable, replay-anchored property failure with a unique signature.
+
+## Provenance
+
+Surfaced by: Data Integrity, Concurrency (via the multi-replica drain bug history).
diff --git a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
new file mode 100644
index 0000000000000..336deb408759b
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
@@ -0,0 +1,46 @@
+# upsert-state-rehydrates-correctly
+
+## Summary
+
+After a clusterd restart, the rehydrated upsert state — observed via `SELECT * FROM source` — equals the state at the most recent durable timestamp before the restart, for every key produced so far.
+
+## Code paths
+
+- `src/storage/src/upsert.rs:791-799` — snapshot phase: drain input at `resume_upper` boundary, all snapshot values marked with `provisional_order = None` (sorts lowest).
+- `src/storage/src/upsert/types.rs:1062` — `panic!("attempted completion of already completed upsert snapshot")` is the guard for the snapshot-completion state machine.
+- `src/storage/src/upsert/types.rs:584-682` — `StateValue::ensure_decoded` finalizes the consolidating state. The `diff_sum ∈ {0, 1}` invariant must hold at completion time.
+- `src/storage/src/upsert_continual_feedback.rs` — the continual-feedback variant uses a persist `Listen` to receive feedback values; the same correctness contract applies.
+
+## How to check it
+
+Workload procedure:
+1. Produce many `(key, value)` and `(key, null)` messages; track `expected_state`.
+2. Wait for `offset_committed` to advance past the last produced offset.
+3. Snapshot `expected_state` and the source's `SELECT * FROM source` content side-by-side; assert equality.
+4. Kill clusterd; wait for restart and quiet period.
+5. Re-run the comparison: `SELECT * FROM source` must equal the pre-kill snapshot.
+6. `assert_always!(upsert_state_rehydrated_correctly, "upsert: rehydrated state equals pre-restart state")`.
+
+## What goes wrong on violation
+
+The source comes back with wrong values per key, missing keys, or keys that should be tombstoned but are present. The bug is silent — the source reports healthy and the workload sees plausible-but-wrong data.
+
+## Antithesis angle
+
+The interesting window is between the persist sink's `compare_and_append` succeeding for batch N and the upsert operator's *next* snapshot-completion. If a crash drops feedback delivery between those two points, the next incarnation's snapshot may see partial state and complete with the wrong tombstone/value mapping.
+
+Compounded by RocksDB merge operator behavior (commit `0d8d740b47`): if the merge operator interleaves with snapshot completion in a way that drops a tombstone, the rehydrated state diverges.
+
+## Dependencies
+
+- Requires node-termination faults enabled.
+- Combine with `upsert-state-consolidation-wellformed` (the deeper `ensure_decoded` correctness check) for full coverage of the snapshot path.
+- Combine with `kafka-source-no-data-duplication` to rule out the related failure mode where rehydration introduces duplicates rather than wrong values.
+
+## Existing instrumentation
+
+None. Candidate SUT anchors: an `assert_sometimes!(upsert_snapshot_completed, "upsert: snapshot phase completed")` at the snapshot-completion call site, and `assert_always!(diff_sum_in_range, …)` mirroring the existing `panic!` in `ensure_decoded`.
+
+## Provenance
+
+Surfaced by: Failure Recovery, Data Integrity.
diff --git a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
new file mode 100644
index 0000000000000..74f5f13a7ba49
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
@@ -0,0 +1,38 @@
+# upsert-tombstone-removes-key
+
+## Summary
+
+A `(key, null)` tombstone message eventually removes the key from the UPSERT source, and the key stays absent until a non-null value is produced for it.
+
+## Code paths
+
+- `src/storage/src/render/sources.rs` — `upsert_commands` maps `None` value → tombstone signal: `(UpsertKey, None, from_time)`.
+- `src/storage/src/upsert.rs` — `upsert_classic`: on `None` value with existing prior value, emit retraction at new timestamp and `multi_put(key, tombstone)`.
+- `src/storage/src/upsert/types.rs` — `StateValue::tombstone()` constructor; `ensure_decoded` with `diff_sum == 0` produces this state.
+
+## How to check it
+
+Workload procedure:
+1. Produce `(key, v)` to topic.
+2. Wait for source to ingest it; verify row visible.
+3. Produce `(key, null)`.
+4. After a quiet period, `assert_always!(tombstoned_key_absent, "upsert: tombstoned key has no row")`, checking that `SELECT count(*) FROM source WHERE key = ?` returns 0.
+5. Bonus: kill clusterd, restart, assert the row is still absent (no resurrection).
+
+## What goes wrong on violation
+
+A deleted row reappears after restart. This is both a compliance and a correctness hazard. The likely cause is the snapshot replay misdecoding a consolidating state that should resolve to a tombstone — the `diff_sum == 0` branch of `ensure_decoded` is what guards this.
+
+## Antithesis angle
+
+- Crash between emitting the tombstone retraction and `multi_put(tombstone)`. The state store is now out of sync with (ahead of or behind) the persisted output; the snapshot replay on restart is what reconciles the two.
+- Race `(k, v)`, `(k, null)`, `(k, v')` deliveries: every interleaving must end with `v'` visible.
+- For the no-resurrection half: produce tombstone, wait for `offset_committed` to advance past its offset, then kill clusterd. On restart, the key must not reappear.
+
+## Existing instrumentation
+
+None. Workload-side check. The `StateValue::tombstone` construction path and the `ensure_decoded` tombstone branch are the relevant code; adding `assert_sometimes!(tombstone_emitted, ...)` inside the tombstone-emit path gives a coverage signal.
+
+## Provenance
+
+Surfaced by: Data Integrity, Lifecycle Transitions (delete operations).
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index ffbba999a7031..0645f1e868414 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -1,6 +1,6 @@
 ---
-commit: ca6deb6758e651876582ae7d4dec24ce32d87567
-updated: 2026-05-06
+commit: 007c7af9d9970fb2030c7212368b232e0fbc363e
+updated: 2026-05-11
 ---
 
 # Property Catalog: Materialize
@@ -53,6 +53,17 @@ Properties that verify data correctness when crashes, network partitions, and co
 | **Antithesis Angle** | Inject network failures on consensus calls mid-flight. Kill writer after batch is queued but before state is committed. Antithesis explores the window between consensus write and acknowledgment. |
 | **Why It Matters** | Indeterminate errors are the hardest to handle correctly in distributed systems. Duplication or loss here silently corrupts downstream materialized views. Surfaced by: Data Integrity. |
 
+### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — incorrect fencing allows premature GC causing data loss |
+| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. |
+| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. |
+| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. |
+| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. |
+
 ## Category 2: Consistency Model Enforcement
 
 Properties that verify Materialize's strict serializability guarantee and timestamp oracle correctness.
@@ -205,13 +216,183 @@ Properties that verify the system reaches interesting states under fault injecti
 | **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. |
 | **Why It Matters** | This is the end-to-end user-visible correctness property. Materialize's value proposition is that MVs are always up-to-date. Surfaced by: Product Context. |
 
-### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes
+## Category 7: Kafka Source Ingestion (Append-Only + UPSERT)
+
+Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` → `ReclockOperator` → optional decode/UPSERT → `persist_sink`. Both envelopes are covered, with shared properties for reclocking and source-frontier behavior. Workload-level checks compare produced Kafka records against what a SQL `SELECT` over the source returns; SUT-side checks live in the source/upsert/reclock operators.
+
+### kafka-source-no-data-loss — Every Produced Record Is Eventually Visible
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P0 — primary user-visible contract; "data is in Kafka but not in Materialize" is the worst possible streaming bug |
+| **Property** | After producing a message to a Kafka topic, the Materialize source over that topic eventually contains a row corresponding to that message (NONE envelope) or a row reflecting the latest value for that key (UPSERT envelope). |
+| **Invariant** | `Sometimes(all_produced_records_visible)`: at least once during a run, after a quiet period, the workload observes `COUNT(*) FROM source` >= number of produced records (NONE) or every produced (key, value) pair is reflected in the source state (UPSERT). Liveness, so `Sometimes` on the catch-up event. |
+| **Antithesis Angle** | Network partitions between Materialize and Kafka, clusterd kills mid-ingestion, persist write retries, and rebalances. The interesting timing is the *crash mid-batch* window: some offsets are in persist, some are not, and the resume frontier determines what we re-read. Antithesis explores whether the re-read covers exactly the missing offsets. |
+| **Why It Matters** | This is the headline guarantee of a streaming database. A bug here is silent data loss visible to every user of the source. Supersedes the more generic `source-ingestion-progress` for Kafka specifically. |
+
+### kafka-source-no-data-duplication — No Record Appears Twice After Settling
 
 | | |
 |---|---|
 | **Type** | Safety |
-| **Priority** | P1 — incorrect fencing allows premature GC causing data loss |
-| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. |
-| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. |
-| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. |
-| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. |
+| **Priority** | P0 — silent duplication corrupts every aggregate downstream MV |
+| **Property** | After settling, the NONE-envelope source contains at most one row per `(partition, offset)` tuple; the UPSERT-envelope source contains at most one row per key. |
+| **Invariant** | `Always`: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1,2 HAVING COUNT(*) > 1` returns no rows for NONE; `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns no rows for UPSERT. Checked on every assertion firing — must hold on every observation. |
+| **Antithesis Angle** | Reader crashes between persist-sink batch write and `compare_and_append`; rehydration re-reads offsets we already wrote. The protection lives in `last_offsets` filtering (kafka.rs:1158) but only for the *current* incarnation — across restart, idempotency depends on the persist sink and (for UPSERT) the feedback-driven snapshot. Antithesis explores crash/restart timing across batch boundaries. Direct regression target for upsert double-retraction bug (commit 1accbe28b3, database-issues#9160). |
+| **Why It Matters** | Duplicate rows in the source flow into every downstream materialized view's aggregates and joins. Silent and devastating. |
+
+### kafka-source-frontier-monotonic — Source Persist Shard Upper Never Regresses
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — frontier regression panics downstream operators and breaks `AS OF` queries |
+| **Property** | The `upper` frontier of the source's data persist shard never regresses across the lifetime of the source, including across clusterd restarts and `compare_and_append` retries. |
+| **Invariant** | `Always`: observed `upper(t2) >= upper(t1)` for any observation order `t1 < t2`. Checked on every observation in a workload polling loop, and ideally also as a SUT-side `assert_always!` next to the persist sink's `compare_and_append`. |
+| **Antithesis Angle** | Kill clusterd mid-`compare_and_append`; resume the source with a stale cached upper; concurrent reclock and persist-sink writers. Direct regression target for the `as_of`/reclock-upper race (commit e3805ad790, database-issues#8698) and the persist-sink cached upper bug (commit 505dc96aaa). |
+| **Why It Matters** | Frontier regression manifests as panics (`as_of > upper`) or as observably incorrect AS OF queries. Documented invariant for persist. |
+
+### kafka-source-survives-broker-fault — Source Resumes After Broker Connectivity Restored
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P1 — operational expectation; broker faults are a routine condition |
+| **Property** | After a transient network partition or Kafka broker outage that prevents the source from making progress, once connectivity is restored, the source eventually ingests all messages that were produced during the outage. |
+| **Invariant** | `Sometimes(source_resumes_after_broker_fault)`: at least once per run, after injecting a network fault between materialized and Kafka and then calling `ANTITHESIS_STOP_FAULTS`, the workload observes the source's `COUNT(*)` advance past its pre-fault value. |
+| **Antithesis Angle** | Network partition between the `materialized` container and the Kafka container; persist+metadata stay reachable. Tests rdkafka reconnect, snapshot statistics restoration (commit 0a34b6c79d), and that no permanent stall mode is entered. |
+| **Why It Matters** | Cloud streaming setups routinely see transient Kafka unavailability. A source that gets stuck and never recovers is an outage. |
+
+### kafka-source-survives-clusterd-restart — Source Resumes After clusterd Crash
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P1 — recovery from clusterd kill is the most common operational fault path |
+| **Property** | After clusterd (storage worker) is killed and restarted, the Kafka source recovers, replays the right resume offsets, and ingests messages produced before, during, and after the restart. |
+| **Invariant** | `Sometimes(source_recovered_after_clusterd_restart)`: after a kill+restart, eventually `COUNT(*) FROM source >= produced_count`. Combined with `kafka-source-no-data-duplication` to also rule out double-counting. |
+| **Antithesis Angle** | Direct test of the `storage-command-replay-idempotent` mechanism end-to-end through Kafka. Antithesis explores crash timing across the reclock mint, persist-sink append, and upsert snapshot-completion windows. Requires node-termination faults to be enabled. |
+| **Why It Matters** | This is the recovery contract the storage controller is built around. Failure here makes every higher-level property meaningless. |
+
+### upsert-key-reflects-latest-value — UPSERT Source Reflects Latest Value Per Key
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P0 — the entire user-visible promise of the UPSERT envelope |
+| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets after quiet period"). |
+| **Property** | At a settled timestamp, for each key produced by the workload, the UPSERT source contains exactly the value from the last `(key, value)` message produced — or no row if the last message for that key was a tombstone. |
+| **Invariant** | `Always`: for every workload-tracked key, `SELECT value FROM source WHERE key = ?` returns the expected value (or empty for tombstoned keys), as determined by the workload's local model of what it produced. Checked after `ANTITHESIS_STOP_FAULTS` quiet periods. |
+| **Antithesis Angle** | Reorder produce timing, kill clusterd between the prior-value lookup (`multi_get`) and the new-value write (`multi_put`), inject delays in the feedback-driven snapshot phase. Tests order-key monotonicity (commit f177db8286), state-backend consistency, and snapshot-completion correctness. |
+| **Why It Matters** | UPSERT semantics — "the source mirrors the upstream key/value store" — is the reason customers pick this envelope. Wrong value per key is silent corruption that flows into all downstream MVs. |
+
+### upsert-tombstone-removes-key — Tombstone Eventually Removes the Key
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — delete semantics are routinely relied on for GDPR/correctness |
+| **Property** | After producing a `(key, null)` tombstone message to the Kafka topic, the UPSERT source eventually contains no row for that key, and the row stays absent until a new non-null value is produced. |
+| **Invariant** | `Always`: at any settled observation after the tombstone has been ingested (resume_upper > tombstone offset), `SELECT * FROM source WHERE key = ?` returns 0 rows. The "no resurrection" half is also `Always`: a key that has been tombstoned and not re-inserted must not reappear after a clusterd restart or rehydration cycle. |
+| **Antithesis Angle** | Race the tombstone against a state-store snapshot completion. Crash clusterd between persist sink writing the retraction and the upsert state recording the tombstone. The `StateValue::Value` -> tombstone path in `upsert/types.rs` is the relevant code; bugs here look like resurrected rows. |
+| **Why It Matters** | A "deleted" row reappearing is both a correctness bug and a compliance hazard. |
+
+### upsert-state-rehydrates-correctly — UPSERT State Reconstructs Exactly After Restart
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — incorrect rehydration produces wrong-but-plausible-looking output |
+| **Property** | After a clusterd restart, the rehydrated upsert state, as observed via `SELECT * FROM source`, equals the state at the most recent durable timestamp before the restart, for every key produced so far. |
+| **Invariant** | `Always`: after a kill+restart quiet period, the workload's local key/value model matches the source's contents for every key whose latest message has `offset <= resume_upper`. Combines with `kafka-source-no-data-duplication` (no double inserts on rehydration) and `upsert-key-reflects-latest-value` (correct value per key). |
+| **Antithesis Angle** | The interesting window is between `compare_and_append` of the persist sink and the upsert operator's feedback-driven snapshot completion. If the feedback replay deduplication is wrong, rehydrated state diverges from durable state. Direct regression target for the upsert snapshot-completion logic in `upsert/types.rs` and `upsert_continual_feedback*`. |
+| **Why It Matters** | Wrong rehydration is silent — the source comes up "healthy" and serves bad data. Hardest class of bug to detect in production. |
+
+### upsert-decode-error-retractable — Bad Value Errors Are Retracted By Subsequent Good Value
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P2 — documented contract; supports operational "fix the bad message and continue" recovery |
+| **Property** | When decoding a Kafka message produces an `UpsertError::Value` (or `UpsertError::KeyDecode` or `UpsertError::NullKey`) for a key, and a subsequent message produces a valid `(key, value)` pair for the same key, the source state for that key transitions from "row containing error" to "row containing the new value" — i.e. the error is retracted. |
+| **Invariant** | `Always`: at a settled timestamp after the corrective message has been ingested, `SELECT * FROM source WHERE key = ?` returns the corrected value with no remaining error row. Note this is the *upsert*-specific retractability (`EnvelopeError::Upsert(..)`); `EnvelopeError::Flat(..)` is explicitly non-retractable. |
+| **Antithesis Angle** | Produce an undecodable value, then a good value for the same key, while injecting delays between the two. Race against snapshot completion (errored value during snapshot vs. corrected value post-snapshot). |
+| **Why It Matters** | This is the operational contract by which users recover from upstream schema mistakes without dropping the source. The relevant code path is `upsert_commands` (render/sources.rs) and `upsert.rs`. |
+
+### upsert-no-internal-panic — Upsert Operator's Internal Asserts Never Fire
+
+| | |
+|---|---|
+| **Type** | Reachability (Unreachable) |
+| **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit |
+| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). |
+| **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. |
+| **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. |
+| **Why It Matters** | These panics indicate the operator entered an internal state its author thought was impossible. Past bugs (commits f177db8286, 1accbe28b3) reached production exactly through these paths. The asserts already exist; we just need to wrap them with the Antithesis SDK so the failures become reportable properties rather than process kills. |
+
+### upsert-state-consolidation-wellformed — `ensure_decoded` Resolves To `diff_sum ∈ {0, 1}` With Matching Checksums
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P0 — directly guards upsert state-store data integrity; catches XOR/checksum corruption |
+| **Property** | When the upsert state backend's `StateValue::ensure_decoded` finalizes a `Consolidating` cell into either a live `Value` or a `tombstone`, the consolidating accumulator is well-formed: `diff_sum ∈ {0, 1}`; if `diff_sum == 1` the recovered bytes match the recorded `len_sum` and `checksum_sum` (seahash of `value_xor[..len_sum]`); if `diff_sum == 0` then `len_sum == 0`, `checksum_sum == 0`, and every byte of `value_xor` is zero. |
+| **Invariant** | `Always`: the `panic!("invalid upsert state: non 0/1 diff_sum: …")` at `upsert/types.rs:672` becomes an `assert_always!(false, "upsert: non 0/1 diff_sum")` with a unique message. The intermediate `assert_eq!`s at :621, :632, :637 and the `assert!` at :642 are likewise upgraded to `assert_always!` so they report rather than crash. Each site gets a distinct, specific message. |
+| **Antithesis Angle** | The consolidating state collapses many `(diff, bytes)` updates per key into running `diff_sum`, `len_sum`, `checksum_sum`, and an XOR-merged `value_xor` blob. The invariant relies on (a) every retraction being paired with an identical insertion in the snapshot stream, and (b) the snapshot completion contract delivering exactly the durable state at the resume frontier. Antithesis explores: crash mid-snapshot-replay, RocksDB merge operator interleaved with multi_put, partial feedback delivery across restart, and (most subtly) duplicated retractions from multi-replica drain (commit 1accbe28b3). Any of these can break the XOR cancellation and trip a non-{0,1} diff_sum. |
+| **Why It Matters** | This is the deepest "the math broke" guard in the upsert pipeline. A trip here means either the feedback stream replayed wrong contents or a duplicate retraction snuck through. The existing panic already dumps a rich diagnostic — wrapping it as an Antithesis assertion turns it into a reportable, replayable property failure rather than a process abort. |
+
+### upsert-ensure-decoded-called-before-access — Consolidating State Is Always Decoded Before Use
+
+| | |
+|---|---|
+| **Type** | Reachability (Unreachable) |
+| **Priority** | P2 — type-state protocol invariant; high-signal as a replay anchor |
+| **Property** | Every accessor on `StateValue` that requires the cell to be in `Value` form is preceded by a call to `ensure_decoded` for that cell. The six accessor panics — `into_decoded` (297), `into_provisional_value` (369), `into_provisional_tombstone` (403), `provisional_order` (416), `provisional_value_ref` (430), `into_finalized_value` (440) — never fire. |
+| **Invariant** | `Unreachable`: each `panic!("called \`...\` without calling \`ensure_decoded\`")` site is converted to a distinct `assert_unreachable!("upsert: <accessor> on Consolidating")`. Six unique assertion messages, one per accessor, so an Antithesis report distinguishes which contract was violated. These are pure protocol-misuse guards — they cannot fire in valid execution. |
+| **Antithesis Angle** | These panics are most likely to fire after a code change to the upsert operator (e.g. a new code path that forgets `ensure_decoded` before reading `provisional_value`). Antithesis exercises every operator branch under fault injection; turning these into reachability assertions gives a cheap regression-detection net for future refactors of `upsert.rs` / `upsert_continual_feedback*.rs`. They are also useful replay anchors — if Antithesis ever does reach them, the bug is reproducible. |
+| **Why It Matters** | These guard a type-state contract that is currently enforced only at runtime. The cost of instrumenting them is essentially zero (rename `panic!` to `assert_unreachable!`), and the upside is that any future violation surfaces as a property failure that can be replayed deterministically. |
+
+### kafka-source-no-internal-panic — Kafka Source Reader's Explicit Panics Never Fire
+
+| | |
+|---|---|
+| **Type** | Reachability (Unreachable) |
+| **Priority** | P1 — direct regression target for topic-recreation and offset-handling bugs |
+| **Property** | The explicit panics in `kafka.rs` never fire: `panic!("got negative offset (...)")` (kafka.rs:1193); `panic!("unexpected source export details: ...")` (kafka.rs:276); the `assert!(self.last_offsets[output][partition])` (kafka.rs:1142); plus the `expect()` sites on resume-upper / statistics / offset arithmetic. |
+| **Invariant** | `Unreachable`: each site converted to a unique `assert_unreachable!("kafka: <site>")`. The "negative offset" panic in particular is a known structural-invariant violation that has fired before. |
+| **Antithesis Angle** | Topic deletion + recreation, partition rebalancing, manual offset reset on the Kafka broker, clock jumps that interact with Kafka's internal offset arithmetic. Direct regression target for commit 99ad668af5 (capability downgrade on topic recreation). |
+| **Why It Matters** | A panic in the source reader takes down the storage worker. Replacing the panic with an Antithesis assertion gives a *reportable* failure rather than a crash that masks itself as "clusterd was restarted." |
+
+### remap-shard-antichain-wellformed — Remap Shard Accumulates To Well-Formed Antichain
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — load-bearing invariant for reclock correctness; explicitly stated in source doc comment |
+| **Property** | At every Materialize timestamp `t`, the remap shard's contents accumulated to `t` form a well-formed `Antichain<KafkaTimestamp>`: each source-time element has frequency exactly 1, the antichain is not empty if any source data has been bound, and (under multi-partition source) there is one element per partition range with no overlaps. |
+| **Invariant** | `Always`: enforced as an `assert_always!` inside `ReclockOperator::mint`/`sync` after every state update — that's where the doc comment promises the invariant (reclock.rs:31-34). Workload-level approximation: a periodic SQL query that joins source/remap progress with computed offsets and verifies one-to-one. |
+| **Antithesis Angle** | Concurrent reclock writers (across restart), partition adds/removes between mints, `compare_and_append` retries that interleave with metadata refresh. The remap shard is the only place where source-time → into-time is durably recorded; a malformed antichain corrupts every subsequent restart's resume frontier. |
+| **Why It Matters** | This is the foundational reclock invariant. Violation here breaks recovery (resume_upper computed wrong), `AS OF` semantics, and the upsert operator's snapshot phase. |
+
+### reclock-mint-eventually-succeeds — Reclock Mint Completes Despite CaS Retries
+
+| | |
+|---|---|
+| **Type** | Liveness |
+| **Priority** | P2 — pre-existing concern under persist instability |
+| **Property** | Under transient persist outages or competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, reclock.rs:160-166) eventually completes for every source-frontier advance that has data to bind. |
+| **Invariant** | `Sometimes(mint_completed_after_cas_retry)`: at least once per run, Antithesis observes a reclock mint that took >1 CaS attempt and then completed (i.e. a successful retry path was exercised). Critically, the workload should also observe that the source frontier eventually advances past the value of `source_upper` captured at the time of the contention — i.e. the loop is not livelocked. |
+| **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. |
+| **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. |
+
+### offset-known-not-below-committed — Source Statistics Causality
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P2 — observable statistics correctness; regression target for commit 3e32df1f69 |
+| **Property** | For every Kafka source, the source-statistics view always reports `offset_known >= offset_committed`. The metric `offset_known` reflects what the broker has told us is available; `offset_committed` reflects what Materialize has durably ingested. Causally, `offset_known` cannot lag `offset_committed`. |
+| **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. |
+| **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. Direct regression target for commit 3e32df1f69. |
+| **Why It Matters** | The statistics view is consumed by users and by operational tooling to compute lag. A regression in causality makes lag metrics meaningless and is the kind of bug that survives unit tests but fails under adversarial timing. |
diff --git a/test/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md
index 4df508acd6f03..b8b250fc37233 100644
--- a/test/antithesis/scratchbook/property-relationships.md
+++ b/test/antithesis/scratchbook/property-relationships.md
@@ -48,9 +48,48 @@ Both test the 0DT deployment pipeline. `deployment-lag-detection` is a prerequis
 
 **Suspected dominance**: `deployment-promotion-safety` is stronger — it requires both lag detection and correct fencing. `deployment-lag-detection` is a liveness check on a subsystem of the promotion pipeline.
 
+## Cluster 7: Kafka Source — User-Visible Ingestion Correctness
+
+**Properties**: `kafka-source-no-data-loss`, `kafka-source-no-data-duplication`, `kafka-source-frontier-monotonic`, `kafka-source-survives-broker-fault`, `kafka-source-survives-clusterd-restart`
+
+The end-to-end Kafka source ingestion contract, as observable from the workload side. `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` are the inverse-pair safety/liveness checks: together they require every produced message to show up *exactly once* (no-loss: at least once; no-duplication: at most once). The two recovery properties (`survives-broker-fault`, `survives-clusterd-restart`) exercise the same contract under different fault classes. `kafka-source-frontier-monotonic` is the lower-level safety property that both no-loss and no-duplication depend on.
+
+**Suspected dominance**: `kafka-source-frontier-monotonic` underpins both `no-data-loss` and `no-data-duplication` — if the persist shard upper goes backwards, both higher-level properties fail. `survives-clusterd-restart` strictly implies `survives-broker-fault` for the recovery code path (clusterd restart triggers all the same rehydration logic plus more), but the two stress different fault classes.
+
+## Cluster 8: UPSERT Envelope — Per-Key Semantics
+
+**Properties**: `upsert-key-reflects-latest-value`, `upsert-tombstone-removes-key`, `upsert-state-rehydrates-correctly`, `upsert-decode-error-retractable`
+
+The user-visible UPSERT contract. `upsert-key-reflects-latest-value` is the headline: latest produced value per key wins. `upsert-tombstone-removes-key` is the special case for `None` values. `upsert-state-rehydrates-correctly` is the post-crash version of `latest-value`. `upsert-decode-error-retractable` is the error-recovery half of the contract — bad messages can be retracted.
+
+**Suspected dominance**: `upsert-state-rehydrates-correctly` implies `upsert-key-reflects-latest-value` in steady state (rehydration produces the right state, and that state is what subsequent operations operate on). `upsert-tombstone-removes-key` is a special case of `upsert-key-reflects-latest-value` (the "last produced was null" case). `upsert-decode-error-retractable` is independent.
+
+## Cluster 9: UPSERT Operator Internals — SUT-Side Asserts
+
+**Properties**: `upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access`
+
+Operator-internal correctness backbone for the UPSERT envelope. All three properties are about converting existing `panic!`/`assert!` sites in the upsert code into Antithesis-reportable assertions. `upsert-state-consolidation-wellformed` is the math-correctness check (XOR/checksum invariants in `ensure_decoded`); `upsert-ensure-decoded-called-before-access` is the type-state protocol check on `StateValue` accessors; `upsert-no-internal-panic` is the broader umbrella covering the diff-positive / commands-state / snapshot-completion guards.
+
+**Suspected dominance**: `upsert-state-consolidation-wellformed` is the deepest signal — a trip there indicates upstream code already failed to preserve some invariant. `upsert-no-internal-panic`'s `assert!(diff.is_positive())` family catches a similar class of upstream-bug-evidence higher up the stack.
+
+## Cluster 10: Kafka Source Internals — SUT-Side Asserts
+
+**Properties**: `kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed`
+
+Reclock and source-reader operator-internal correctness. `remap-shard-antichain-wellformed` is the load-bearing invariant for the entire reclocking subsystem; `reclock-mint-eventually-succeeds` is its liveness companion. `kafka-source-no-internal-panic` is the umbrella for the explicit reader asserts. `offset-known-not-below-committed` is a much narrower statistics-causality check.
+
+**Suspected dominance**: `remap-shard-antichain-wellformed` underpins everything in Cluster 7 — a malformed remap antichain corrupts the resume frontier, which breaks both data-loss and data-duplication properties at the next restart.
+
 ## Cross-Cluster Connections
 
 - `epoch-fencing-prevents-split-brain` (Cluster 2) protects `catalog-recovery-consistency` (Cluster 3) — fencing ensures only one writer during recovery
 - `persist-cas-monotonicity` (Cluster 1) underpins `catalog-recovery-consistency` (Cluster 3) — catalog is stored in persist, so CaS correctness is a prerequisite
 - `strict-serializable-reads` (Cluster 4) depends on `epoch-fencing-prevents-split-brain` (Cluster 2) — split-brain would allow inconsistent timestamp assignments
 - `idempotent-write-under-indeterminate` (Cluster 1) protects `storage-command-replay-idempotent` (Cluster 3) — storage ingestion uses persist writes, so idempotency matters for both
+- `persist-cas-monotonicity` (Cluster 1) underpins `kafka-source-frontier-monotonic` (Cluster 7) — frontier monotonicity at the source level is a direct consequence of CaS monotonicity at the persist level
+- `storage-command-replay-idempotent` (Cluster 3) supports `kafka-source-survives-clusterd-restart` (Cluster 7) — correct command replay is required for source recovery to be idempotent
+- `idempotent-write-under-indeterminate` (Cluster 1) supports `kafka-source-no-data-duplication` (Cluster 7) — the no-duplicate-write guarantee at the persist level is what makes no-data-duplication observable at the source level
+- `remap-shard-antichain-wellformed` (Cluster 10) underpins `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` (Cluster 7) — a malformed remap antichain breaks the resume frontier across restart
+- `upsert-state-consolidation-wellformed` (Cluster 9) underpins `upsert-state-rehydrates-correctly` (Cluster 8) — if the consolidating math is wrong, rehydration is wrong
+- `source-ingestion-progress` (Cluster 4, pre-existing) is now subsumed by `kafka-source-no-data-loss` (Cluster 7) for Kafka specifically; `source-ingestion-progress` remains relevant for non-Kafka sources (Postgres CDC, MySQL CDC, generators)
+- `mv-reflects-source-updates` (Cluster 4) depends on every Cluster 7 and Cluster 8 property — MVs over Kafka sources inherit those sources' correctness
diff --git a/test/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md
index a0ff7561eed5e..c38442d9d96c0 100644
--- a/test/antithesis/scratchbook/sut-analysis.md
+++ b/test/antithesis/scratchbook/sut-analysis.md
@@ -215,3 +215,84 @@ Materialize is organized into three logical layers that run as separate processe
 - What is the preferred metadata store for Antithesis testing — CockroachDB or PostgreSQL?
 - Should we test with multiple compute replicas or single replica?
 - Are there specific failure scenarios the Materialize team wants prioritized?
+
+## Appendix A: Kafka Source Ingestion (Detail)
+
+Added 2026-05-11 in response to scoping toward Kafka source properties (append-only + UPSERT envelope).
+
+### Pipeline shape
+
+`KafkaSourceReader` → `ReclockOperator` → (optional `decode`) → (optional `upsert` operator) → `persist_sink`.
+
+The dataflow is rendered in `src/storage/src/render/sources.rs`. The reader and metadata-fetcher are constructed by `SourceRender for KafkaSourceConnection` in `src/storage/src/source/kafka.rs`. Reclocking is in `src/storage/src/source/reclock.rs` plus `reclock/compat.rs` (the persist-backed remap handle). UPSERT logic is in `src/storage/src/upsert.rs` (classic) and `src/storage/src/upsert_continual_feedback.rs` / `upsert_continual_feedback_v2.rs` (continual-feedback variants).
+
+### Source-time vs into-time
+
+* **Source time** for Kafka is `Partitioned<RangeBound<PartitionId>, MzOffset>` (`mz_storage_types::sources::kafka`). The frontier is a multi-partition antichain.
+* **Into time** is Materialize's `mz_repr::Timestamp` (ms since epoch). The mapping from source time → into time is the *remap shard*: a persist shard whose contents accumulate to a well-formed `Antichain<FromTime>` at every into-time. See `ReclockOperator` doc comment: "for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`."
+* On startup the remap operator loads existing bindings, downgrades to the recovered upper, then mints new bindings when `mint()` receives a probe.
+
+### Partition handling
+
+* Partition → worker assignment is round-robin by hash: `((source_id + partition_id) % worker_count) == worker_id` (`kafka.rs`).
+* New partitions are picked up by the metadata fetcher and routed through reclocking.
+* Per-partition offsets are tracked in `last_offsets`. Code-stated invariant: "if we see offset x, we have seen all offsets [0, x-1] that we are ever going to see" (kafka.rs near line 1005).
+* Offsets that arrive `<=` `last_offset` are silently dropped (kafka.rs ~1158). This is the path that protects against rdkafka redelivery on reconnect.
+* Negative offsets from an otherwise non-errored message cause `panic!` in `construct_source_message` (kafka.rs ~1193).
+
+### Append-only (NONE envelope) workload shape
+
+Decoded rows flow directly into `persist_sink` keyed by Materialize timestamp. Each `(partition, offset)` produces exactly one row (plus metadata columns if requested). There is no retraction unless an upstream EvalError occurs in a downstream operator.
+
+### UPSERT envelope
+
+`upsert_commands` (render/sources.rs) maps each `DecodeResult` into `(UpsertKey, Option<UpsertValue>, FromTime)`:
+
+* `UpsertKey` is a 32-byte SHA-256 digest of the key bytes; collisions are treated as impossible (probabilistic).
+* `Some(value)` is an insert/update for `key`; `None` is a tombstone (delete).
+* Key decode failures produce `UpsertError::KeyDecode`; null keys produce `UpsertError::NullKey`; value decode failures produce `UpsertError::Value`. These flow as `Err` values keyed by the (errored) key and can be *retracted* by a subsequent good `(key, value)` for the same key — this is the contract that makes "fix the bad message" recovery possible without dropping the source.
+
+The upsert operator (`upsert_classic` in `upsert.rs`) consults a state store (`UpsertStateBackend`) for the prior value before emitting updates. Two backends ship:
+
+* `InMemoryHashMap` — `BTreeMap<UpsertKey, StateValue>`. Lost on restart.
+* `RocksDB` — persistent, with a merge operator. Bug history shows the merge operator must always return `Some` or RocksDB aborts the process (commit 0d8d740b47).
+
+State is reconstructed on restart by replaying the persist *feedback* stream (the output of the upsert operator's previous incarnation) up to the resume frontier. The operator passes through a *snapshot* phase that drains all feedback values for keys at or below the resume frontier, then transitions to normal mint-on-input mode.
+
+Key invariants stated in code:
+
+* `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541; mirrored in `upsert_continual_feedback*.rs`) — the upsert operator never sees retractions on its input; only inserts/tombstones.
+* `panic!("key missing from commands_state")` (upsert.rs:636) — the operator's internal dedup table must always contain a key it is about to emit for; missing key is a structural invariant violation.
+* Order-key monotonicity within a key is enforced by `consolidate_snapshot_chunk` / `drain_staged_input`. A regression here previously caused a panic that was "as close to data loss as possible" (commit f177db8286, issue materialize#26655). The fix skips violating updates rather than panicking.
+* In continual-feedback v2: `assert!(diff.is_positive())` again (v2:315) plus `unreachable!()` on `(None, None)` from joined prior/new state (v2:483) and an empty-output assertion in tests (v2:957).
+
+### Reclock invariants and failure modes
+
+* `compare_and_append` on the remap shard can return `UpperMismatch` if a racing writer (e.g. across restart) has advanced the shard. `ReclockOperator::mint` retries by `sync()`-ing and re-minting (reclock.rs:160-166).
+* `panic!("compare_and_append failed: {invalid_use}")` in `reclock/compat.rs:306` catches genuinely invalid persist calls (vs. retryable upper mismatch).
+* Reclock's cached `upper` has a known staleness pitfall (commit e3805ad790, issue database-issues#8698) — fixed by always fetching the recent upper for `as_of` calculation.
+
+### Statistics and progress signals
+
+`statistics.rs` reports per-source counters that have correctness invariants of their own:
+
+* `offset_known >= offset_committed` (commit 3e32df1f69 enforces clamping after a regression bug).
+* `snapshot_records_known >= snapshot_records_staged`, both decrease to zero (clear) at end of snapshot.
+
+These are user-visible numbers and form weak but easily-checkable correctness signals from the workload side.
+
+### Failure-prone areas relevant to Antithesis
+
+| Area | Risk | Code |
+|------|------|------|
+| Negative offset from rdkafka | hard panic | kafka.rs:1193 |
+| Late offset on reconnect | silent drop (correct behavior, but check via `assert_sometimes!(saw_late_offset)`) | kafka.rs:1158 |
+| Topic recreated with fewer offsets | previously panicked on capability downgrade (commit 99ad668af5) | source_reader_pipeline / kafka.rs |
+| Upsert key with timestamp regression | previously panicked (commit f177db8286) | upsert.rs:475-487 |
+| RocksDB merge returning `None` | SIGABRT (commit 0d8d740b47) | upsert/rocksdb.rs |
+| Reclock `compare_and_append` UpperMismatch retry loop | unbounded retry, can block forever under persist outage | reclock.rs:160 |
+| Multi-replica `drain_staged_input` double-pass | duplicate retractions (commit 1accbe28b3) | upsert_continual_feedback.rs |
+| Persist sink cached upper across concurrent sinks | stale read leads to false errors (commit 505dc96aaa) | render/persist_sink.rs |
+| Flag flip mid-append on persist sink | spurious `InvalidBatchBounds` (commit 68e1dfd86d) | render/persist_sink.rs |
+
+These are the seeds for the Kafka-specific property catalog in Category 7 of `property-catalog.md`.
diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py
new file mode 100644
index 0000000000000..a9bf2eac600a1
--- /dev/null
+++ b/test/antithesis/workload/test/helper_kafka.py
@@ -0,0 +1,113 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Thin confluent-kafka producer wrapper for Antithesis drivers.
+
+Tracks the highest delivered offset per (topic, partition) so drivers can poll
+Materialize statistics for catchup. This module does not retry failed
+deliveries itself; any error reported to the delivery callback is logged and
+recorded on the tracker so the driver can surface it.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import threading
+from dataclasses import dataclass, field
+
+from confluent_kafka import KafkaError, KafkaException, Producer
+from confluent_kafka.admin import AdminClient, NewTopic
+
+LOG = logging.getLogger("antithesis.helper_kafka")
+
+BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092")
+
+
+@dataclass
+class DeliveryTracker:
+    """Records highest delivered offset per (topic, partition) and any error."""
+
+    # Highest successfully delivered offset, keyed by (topic, partition).
+    max_offset: dict[tuple[str, int], int] = field(default_factory=dict)
+    # Most recent delivery error; once set it is never cleared by this class.
+    last_error: KafkaException | None = None
+    # Guards reads and writes of the two fields above.
+    _lock: threading.Lock = field(default_factory=threading.Lock)
+
+    def callback(self, err, msg):
+        """Delivery callback: record `err`, or the offset `msg` landed at."""
+        if err is not None:
+            with self._lock:
+                self.last_error = KafkaException(err)
+            LOG.warning("kafka delivery error: %s", err)
+            return
+        key = (msg.topic(), msg.partition())
+        offset = msg.offset()
+        with self._lock:
+            if offset > self.max_offset.get(key, -1):
+                self.max_offset[key] = offset
+
+    def topic_max_offset(self, topic: str) -> int:
+        """Return the highest delivered offset on any partition of `topic`.
+
+        Returns -1 if nothing has been delivered to `topic`. Offsets are
+        tracked per partition, so for a multi-partition topic this is the
+        maximum over partitions, not a count of delivered messages.
+        """
+        with self._lock:
+            return max(
+                (o for (t, _), o in self.max_offset.items() if t == topic),
+                default=-1,
+            )
+
+
+def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTracker]:
+    """Construct a Producer with a fresh DeliveryTracker.
+
+    Nothing here registers the tracker with the producer; callers must pass
+    `tracker.callback` as the delivery callback themselves.
+    """
+    config: dict[str, object] = {
+        "bootstrap.servers": BROKER,
+        "linger.ms": 5,
+        "enable.idempotence": True,
+        "acks": "all",
+    }
+    if client_id:
+        config["client.id"] = client_id
+    return Producer(config), DeliveryTracker()
+
+
+def ensure_topic(topic: str, num_partitions: int = 1) -> None:
+    """Create the topic if it doesn't already exist. No-op on race with auto-create.
+
+    Re-raises the KafkaException from topic creation unless its error code is
+    TOPIC_ALREADY_EXISTS.
+    """
+    admin = AdminClient({"bootstrap.servers": BROKER})
+    existing = admin.list_topics(timeout=10).topics
+    if topic in existing:
+        return
+    LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions)
+    futures = admin.create_topics(
+        [NewTopic(topic, num_partitions=num_partitions, replication_factor=1)]
+    )
+    for t, fut in futures.items():
+        try:
+            fut.result(timeout=30)
+        except KafkaException as exc:
+            # The topic can appear between list_topics and create_topics;
+            # losing that race is fine, anything else is a real failure.
+            err = exc.args[0] if exc.args else None
+            code = getattr(err, "code", lambda: None)()
+            if code == KafkaError.TOPIC_ALREADY_EXISTS:
+                LOG.info("kafka topic %s raced with auto-create; continuing", t)
+                continue
+            raise
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
new file mode 100644
index 0000000000000..d90babf162baf
--- /dev/null
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -0,0 +1,140 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Resilient Materialize/pgwire connection helpers for Antithesis drivers.
+
+The workload runs under active fault injection. Every call retries network and
+admission errors transparently; everything else propagates.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+from collections.abc import Callable, Iterator, Sequence
+from contextlib import contextmanager
+from typing import Any, TypeVar
+
+import psycopg
+
+LOG = logging.getLogger("antithesis.helper_pg")
+
+PGHOST = os.environ.get("PGHOST", "materialized")
+PGPORT = int(os.environ.get("PGPORT", "6875"))
+PGUSER = os.environ.get("PGUSER", "materialize")
+PGDATABASE = os.environ.get("PGDATABASE", "materialize")
+
+# Retry tuning. Antithesis injects partitions and node hangs; conservative bounds
+# keep drivers progressing without masking real correctness signals.
+_CONNECT_TIMEOUT_S = 5
+_RETRY_BUDGET_S = 60
+_RETRY_INITIAL_S = 0.1
+_RETRY_MAX_S = 2.0
+
+_T = TypeVar("_T")
+
+
+def _retryable(exc: BaseException) -> bool:
+    """Return True for psycopg errors treated as transient."""
+    if isinstance(exc, psycopg.OperationalError):
+        return True
+    # psycopg wraps server-side admin shutdowns as InterfaceError on next op.
+    if isinstance(exc, psycopg.InterfaceError):
+        return True
+    return False
+
+
+def _run_with_retry(label: str, attempt: Callable[[], _T]) -> _T:
+    """Call `attempt` until it succeeds, retrying transient errors.
+
+    Sleeps with exponential backoff (capped at _RETRY_MAX_S) between tries.
+    A non-retryable error, or a retryable one raised after _RETRY_BUDGET_S
+    has elapsed, propagates to the caller. `label` only appears in the log.
+    """
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            return attempt()
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info("pg %s retrying after %s", label, exc)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+
+
+@contextmanager
+def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]:
+    """Yield a connection, retrying transient failures up to _RETRY_BUDGET_S.
+
+    Only establishing the connection is retried; errors raised while the
+    caller uses the connection propagate. The connection is closed on exit.
+    """
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            conn = psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                connect_timeout=_CONNECT_TIMEOUT_S,
+                autocommit=autocommit,
+            )
+            break
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info("pg connect retrying after %s; backoff=%.2fs", exc, backoff)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+    try:
+        yield conn
+    finally:
+        try:
+            conn.close()
+        except Exception:  # noqa: BLE001
+            # Best effort: a failure to close must not mask the caller's outcome.
+            pass
+
+
+def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None:
+    """Execute a statement, retrying transient errors. No result returned.
+
+    Each retry opens a new connection and runs `sql` again, so `sql` should
+    be safe to execute more than once.
+    """
+
+    def attempt() -> None:
+        with connect() as conn, conn.cursor() as cur:
+            cur.execute(sql, params or ())
+
+    _run_with_retry("execute", attempt)
+
+
+def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]:
+    """Run a query and return all rows, retrying transient errors."""
+
+    def attempt() -> list[tuple[Any, ...]]:
+        with connect() as conn, conn.cursor() as cur:
+            cur.execute(sql, params or ())
+            return list(cur.fetchall())
+
+    return _run_with_retry("query", attempt)
+
+
+def query_one_retry(
+    sql: str, params: Sequence[Any] | None = None
+) -> tuple[Any, ...] | None:
+    """Like `query_retry`, but return only the first row (None if no rows)."""
+    rows = query_retry(sql, params)
+    return rows[0] if rows else None
diff --git a/test/antithesis/workload/test/helper_quiet.py b/test/antithesis/workload/test/helper_quiet.py
new file mode 100644
index 0000000000000..adb4f9ead3e6d
--- /dev/null
+++ b/test/antithesis/workload/test/helper_quiet.py
@@ -0,0 +1,49 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Wrapper around the Antithesis ANTITHESIS_STOP_FAULTS binary.
+
+Outside Antithesis (e.g. snouty local validate), the env var is unset and this
+becomes a no-op so the workload still runs end-to-end.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import subprocess
+
+LOG = logging.getLogger("antithesis.helper_quiet")
+
+
+def request_quiet_period(seconds: int) -> bool:
+    """Request that Antithesis pause all faults for `seconds`.
+
+    Returns True if the request was issued, False if not in Antithesis. Either
+    way callers must still poll for the system to stabilize — the binary
+    returns immediately and the actual quiet window unfolds asynchronously.
+
+    A non-zero exit status from the binary is logged as a warning but still
+    counts as "issued"; the return value does not change.
+    """
+    binary = os.environ.get("ANTITHESIS_STOP_FAULTS")
+    if not binary:
+        LOG.info("ANTITHESIS_STOP_FAULTS not set; skipping quiet-period request")
+        return False
+    LOG.info("requesting %ds quiet period via %s", seconds, binary)
+    result = subprocess.run([binary, str(seconds)], check=False)
+    if result.returncode != 0:
+        # Keep going (best effort), but leave a trace for triage: without it
+        # a rejected request is indistinguishable from a granted one.
+        LOG.warning(
+            "quiet-period request via %s exited with status %d",
+            binary,
+            result.returncode,
+        )
+    return True
diff --git a/test/antithesis/workload/test/helper_random.py b/test/antithesis/workload/test/helper_random.py
new file mode 100644
index 0000000000000..cb749227d6f17
--- /dev/null
+++ b/test/antithesis/workload/test/helper_random.py
@@ -0,0 +1,72 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Deterministic randomness for Antithesis drivers.
+
+Drivers draw every random value through this module. When the Antithesis SDK
+is importable it is the entropy source, which lets Antithesis replay a
+timeline deterministically. Otherwise a stdlib `random.Random` instance,
+seeded once per process from `os.urandom`, is used instead.
+"""
+
+from __future__ import annotations
+
+import os
+import random as _stdlib_random
+from collections.abc import Sequence
+from typing import TypeVar
+
+try:
+    from antithesis import random as _ar
+
+    _ANTITHESIS = True
+except ImportError:
+    _ANTITHESIS = False
+
+T = TypeVar("T")
+
+# Fallback generator for runs without the SDK. The seed comes from os.urandom
+# at import time, so the sequence differs from one process to the next.
+_FALLBACK = _stdlib_random.Random(int.from_bytes(os.urandom(8), "little"))
+
+
+def random_u64() -> int:
+    """Return a random integer from the SDK, or 64 fallback bits without it."""
+    if not _ANTITHESIS:
+        return _FALLBACK.getrandbits(64)
+    return _ar.get_random()
+
+
+def random_choice(seq: Sequence[T]) -> T:
+    """Pick one element of `seq`; raise ValueError if `seq` is empty."""
+    if not seq:
+        raise ValueError("random_choice on empty sequence")
+    if not _ANTITHESIS:
+        return _FALLBACK.choice(seq)
+    return _ar.random_choice(list(seq))
+
+
+def random_int(low: int, high: int) -> int:
+    """Return an integer in [low, high], both ends included."""
+    if low > high:
+        raise ValueError("low > high")
+    width = high - low + 1
+    offset = random_u64() % width
+    return low + offset
+
+
+def random_bool(true_prob: float) -> bool:
+    """Return True with probability about `true_prob`, at 1/65536 resolution."""
+    if not 0.0 <= true_prob <= 1.0:
+        raise ValueError("true_prob out of range")
+    # Compare 16 random bits with an integer threshold: no float arithmetic
+    # is applied to the random draw itself.
+    threshold = int(true_prob * 0x10000)
+    draw = random_u64() & 0xFFFF
+    return draw < threshold
diff --git a/test/antithesis/workload/test/helper_source_stats.py b/test/antithesis/workload/test/helper_source_stats.py
new file mode 100644
index 0000000000000..54af7f0e29866
--- /dev/null
+++ b/test/antithesis/workload/test/helper_source_stats.py
@@ -0,0 +1,99 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Catchup polling against `mz_internal.mz_source_statistics`.
+
+Used by drivers to wait until a Kafka source has durably ingested at least
+some target offset (typically the maximum produced offset). All durations are
+budgeted; callers handle timeouts.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+
+from helper_pg import query_one_retry
+
+LOG = logging.getLogger("antithesis.helper_source_stats")
+
+
+def offset_committed(source_name: str) -> int | None:
+    """Return the maximum offset_committed for `source_name`, or None.
+
+    `mz_source_statistics.offset_committed` is the durably-ingested upstream
+    offset, aggregated across replicas in the view. Returns None if the
+    statistics row does not exist yet (very early in source lifetime) so
+    callers can distinguish "not initialized" from "still behind."
+    """
+    row = query_one_retry(
+        """
+        SELECT MAX(ss.offset_committed)::bigint
+        FROM mz_internal.mz_source_statistics ss
+        JOIN mz_sources s ON s.id = ss.id
+        WHERE s.name = %s
+        """,
+        (source_name,),
+    )
+    # MAX over zero rows yields a single NULL, which arrives here as row[0] None.
+    if row is None or row[0] is None:
+        return None
+    return int(row[0])
+
+
+def wait_for_catchup(
+    source_name: str,
+    target_offset: int,
+    timeout_s: float = 60.0,
+    poll_interval_s: float = 0.5,
+) -> bool:
+    """Wait until offset_committed for `source_name` reaches `target_offset`.
+
+    Returns True if catchup completed within `timeout_s`, False on timeout.
+    The source is always polled at least once (even with `timeout_s <= 0`) and
+    once more when the deadline is reached, so progress made during the last
+    sleep is not missed.
+
+    NOTE(review): the comparison is `offset_committed >= target_offset`. If
+    the statistic is an exclusive upper bound rather than the last ingested
+    offset, this is satisfied one message early — confirm against the
+    definition of `mz_source_statistics.offset_committed`.
+    """
+    deadline = time.monotonic() + timeout_s
+    last_seen: int | None = None
+    while True:
+        observed = offset_committed(source_name)
+        if observed is not None and observed >= target_offset:
+            LOG.info(
+                "source %s caught up: observed=%d target=%d",
+                source_name,
+                observed,
+                target_offset,
+            )
+            return True
+        if observed != last_seen:
+            LOG.info(
+                "source %s waiting for catchup: observed=%s target=%d",
+                source_name,
+                observed,
+                target_offset,
+            )
+            last_seen = observed
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        # Never sleep past the deadline, so a final poll happens right at it.
+        time.sleep(min(poll_interval_s, remaining))
+    LOG.warning(
+        "source %s catchup timeout: observed=%s target=%d",
+        source_name,
+        last_seen,
+        target_offset,
+    )
+    return False
diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py
new file mode 100644
index 0000000000000..59332b28d64e9
--- /dev/null
+++ b/test/antithesis/workload/test/helper_upsert_source.py
@@ -0,0 +1,61 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis UPSERT-envelope Kafka source.
+
+Shared by every driver that exercises UPSERT semantics. The topic is not
+created here (the Kafka broker auto-creates it on first produce); the
+connection and source are created with CREATE ... IF NOT EXISTS, so calling
+these helpers from many drivers creates each object at most once.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from helper_pg import execute_retry
+
+LOG = logging.getLogger("antithesis.helper_upsert_source")
+
+KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092")
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+
+CONNECTION_NAME = "antithesis_kafka_conn"
+TOPIC_UPSERT_TEXT = "antithesis-upsert-text"
+SOURCE_UPSERT_TEXT = "upsert_text_src"
+
+
+def ensure_kafka_connection() -> None:
+    """Create the shared Kafka connection unless it already exists."""
+    statement = " ".join(
+        [
+            f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME}",
+            f"TO KAFKA (BROKER '{KAFKA_BROKER}', SECURITY PROTOCOL = 'PLAINTEXT')",
+        ]
+    )
+    execute_retry(statement)
+
+
+def ensure_upsert_text_source() -> None:
+    """Create the text key/value UPSERT source unless it already exists.
+
+    The resulting source has columns `key TEXT NOT NULL` and `text TEXT`.
+    Also ensures the Kafka connection the source depends on.
+    """
+    ensure_kafka_connection()
+    clauses = [
+        f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT}",
+        f"IN CLUSTER {CLUSTER}",
+        f"FROM KAFKA CONNECTION {CONNECTION_NAME} (TOPIC '{TOPIC_UPSERT_TEXT}')",
+        "KEY FORMAT TEXT VALUE FORMAT TEXT",
+        "ENVELOPE UPSERT",
+    ]
+    execute_retry(" ".join(clauses))
+    LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT)
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
new file mode 100755
index 0000000000000..7aa54acb3192d
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `upsert-key-reflects-latest-value`.
+
+For each key produced to a Kafka UPSERT-envelope source, after a quiet period
+that lets Materialize catch up, the source's row for that key must reflect the
+last value produced — or be absent if the last message was a tombstone.
+
+Each invocation:
+  1. Ensures the upsert source exists (idempotent CREATE ... IF NOT EXISTS).
+  2. Picks a per-invocation key prefix so concurrent driver instances don't
+     interfere with each other's expected-state model.
+  3. Produces a deterministic mix of upserts and tombstones, tracking the
+     local "what should the source say" model.
+  4. Requests an Antithesis quiet period and waits for offset_committed to
+     reach the highest produced offset.
+  5. For every tracked key, asserts that what's in the source matches the
+     local model. Live keys use one assertion message, tombstoned keys use
+     another, so triage can distinguish the two failure modes.
+
+This is a `parallel_driver_` — Antithesis runs many concurrent instances and
+each one assigns itself a fresh prefix from deterministic randomness, so
+multiple drivers exercise the source without colliding.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+import helper_random
+from helper_kafka import make_producer
+from helper_pg import query_one_retry
+from helper_quiet import request_quiet_period
+from helper_source_stats import wait_for_catchup
+from helper_upsert_source import (
+    SOURCE_UPSERT_TEXT,
+    TOPIC_UPSERT_TEXT,
+    ensure_upsert_text_source,
+)
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.upsert_latest_value")
+
+# Knobs. Kept small per-invocation because Antithesis launches the driver many
+# times; total coverage comes from re-invocations, not from one huge run.
+PRODUCES_PER_INVOCATION = 40  # iterations of the produce loop in main()
+DISTINCT_KEYS = 8  # small key space so we re-write the same key often
+DISTINCT_VALUES = 16  # size of the value pool; values are rendered as vNNNN
+TOMBSTONE_PROB = 0.15  # handed to random_bool() to pick tombstone vs. upsert
+
+QUIET_PERIOD_S = 20  # argument to request_quiet_period()
+CATCHUP_TIMEOUT_S = 60.0  # timeout_s handed to wait_for_catchup()
+
+
+def _produce(producer, tracker, topic: str, key: str, value: str | None) -> None:
+    """Encode value=None as a Kafka tombstone (null payload)."""
+    payload = None if value is None else value.encode("utf-8")
+    producer.produce(
+        topic=topic,
+        key=key.encode("utf-8"),
+        value=payload,
+        on_delivery=tracker.callback,
+    )
+
+
+def _select_value_for_key(key: str) -> tuple[bool, str | None]:
+    """Return (found, value) for the single source row matching `key`.
+
+    Returns (False, None) when no row exists (the tombstone case for an
+    UPSERT source). Returns (True, value) when exactly one row exists.
+    Raises if more than one row exists — that would mean the source is
+    multi-rowed per key and violates the UPSERT contract itself, which is
+    out of scope for this property and should be caught by
+    `kafka-source-no-data-duplication`.
+    """
+    row = query_one_retry(
+        f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s",
+        (key,),
+    )
+    if row is None:
+        return False, None
+    count, value = row
+    if count == 0:
+        return False, None
+    if count != 1:
+        raise RuntimeError(
+            f"upsert source has {count} rows for key {key!r}; this driver assumes "
+            "the per-key uniqueness property holds"
+        )
+    return True, value
+
+
+def main() -> int:
+    """Run one produce/verify pass; every return path yields exit code 0."""
+    ensure_upsert_text_source()
+
+    # Per-invocation prefix isolates this driver's keys from other concurrent
+    # drivers and from previous invocations of this same driver.
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; prefix=%s", prefix)
+
+    producer, tracker = make_producer(client_id=f"antithesis-{prefix}")
+
+    # Local "what should the source say" model for this invocation's keys.
+    # Value of None means "the last message was a tombstone".
+    expected: dict[str, str | None] = {}
+
+    keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)]
+    for _ in range(PRODUCES_PER_INVOCATION):
+        key = helper_random.random_choice(keys)
+        if helper_random.random_bool(TOMBSTONE_PROB):
+            _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None)
+            expected[key] = None
+        else:
+            value = f"v{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}"
+            _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, value)
+            expected[key] = value
+        producer.poll(0)
+
+    # Flush pending deliveries so the tracker reflects the true max produced offset.
+    pending = producer.flush(timeout=30)
+    if pending > 0 or tracker.last_error is not None:
+        # Under sustained fault injection we cannot prove which of the just-
+        # produced messages Kafka actually accepted, so `expected` may name
+        # values the source never sees. Bail out before running safety
+        # assertions — fault-induced delivery loss is not what this property
+        # is testing. The catchup `sometimes()` is also skipped because we
+        # have no trustworthy target offset.
+        LOG.info(
+            "skipping assertions: producer.flush pending=%d last_error=%s",
+            pending,
+            tracker.last_error,
+        )
+        return 0
+
+    max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT)
+    if max_produced < 0:
+        LOG.info("no messages confirmed delivered this invocation; exiting cleanly")
+        return 0
+
+    # Now ask Antithesis to pause faults and wait for Materialize to catch up.
+    request_quiet_period(QUIET_PERIOD_S)
+    caught_up = wait_for_catchup(
+        SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
+    )
+
+    # Liveness signal: at least one invocation should reach catchup. If this
+    # never fires across an entire run, the safety assertions below would be
+    # vacuous and the run is uninteresting.
+    sometimes(
+        caught_up,
+        "upsert: source caught up to produced offsets after quiet period",
+        {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced},
+    )
+
+    if not caught_up:
+        # Don't run the per-key safety assertions on stale data — that would
+        # blame the property for a slow catchup that's a separate concern.
+        LOG.info("catchup did not complete in budget; skipping per-key assertions")
+        return 0
+
+    # Per-key safety assertions. Two distinct messages so triage reports tell
+    # us *which* invariant broke: a value mismatch or a tombstone resurrection.
+    for key, want in expected.items():
+        found, observed = _select_value_for_key(key)
+
+        if want is None:
+            # The last produced message for this key was a tombstone; the
+            # source must not contain a row for it.
+            always(
+                not found,
+                "upsert: tombstoned key has no row in source",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "observed_value": observed,
+                },
+            )
+        else:
+            # Live key: there must be exactly one row, with the latest value.
+            always(
+                found and observed == want,
+                "upsert: SELECT for key matches latest produced value",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "expected_value": want,
+                    "observed_present": found,
+                    "observed_value": observed,
+                },
+            )
+
+    LOG.info("driver done; asserted on %d keys", len(expected))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status

From 7033cce66dd2787d9bce5e9fca0655846f08a9a3 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 17:23:54 -0400
Subject: [PATCH 19/65] src/storage: wrap kafka source + upsert panic sites
 with antithesis-sdk assertions

---
 Cargo.lock                                    | 47 ++++++++++
 Cargo.toml                                    |  1 +
 src/storage/Cargo.toml                        |  1 +
 src/storage/src/source/kafka.rs               | 36 +++++++-
 src/storage/src/source/reclock/compat.rs      | 10 +-
 src/storage/src/upsert.rs                     | 15 +++
 src/storage/src/upsert/types.rs               | 91 +++++++++++++++++--
 src/storage/src/upsert_continual_feedback.rs  | 11 +++
 .../src/upsert_continual_feedback_v2.rs       | 15 ++-
 .../scratchbook/property-catalog.md           |  4 +
 10 files changed, 221 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 86851059fce5e..2f4eed40b37c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -172,6 +172,22 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "antithesis_sdk"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18dbd97a5b6c21cc9176891cf715f7f0c273caf3959897f43b9bd1231939e675"
+dependencies = [
+ "libc",
+ "libloading",
+ "linkme",
+ "once_cell",
+ "rand 0.8.5",
+ "rustc_version_runtime",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.102"
@@ -5120,6 +5136,26 @@ dependencies = [
  "linked-hash-map",
 ]
 
+[[package]]
+name = "linkme"
+version = "0.3.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e83272d46373fb8decca684579ac3e7c8f3d71d4cc3aa693df8759e260ae41cf"
+dependencies = [
+ "linkme-impl",
+]
+
+[[package]]
+name = "linkme-impl"
+version = "0.3.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32d59e20403c7d08fe62b4376edfe5c7fb2ef1e6b1465379686d0f21c8df444b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.15"
@@ -7942,6 +7978,7 @@ dependencies = [
 name = "mz-storage"
 version = "0.0.0"
 dependencies = [
+ "antithesis_sdk",
  "anyhow",
  "arrow",
  "arrow-ipc",
@@ -10661,6 +10698,16 @@ dependencies = [
  "semver",
 ]
 
+[[package]]
+name = "rustc_version_runtime"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dd18cd2bae1820af0b6ad5e54f4a51d0f3fcc53b05f845675074efcc7af071d"
+dependencies = [
+ "rustc_version",
+ "semver",
+]
+
 [[package]]
 name = "rustix"
 version = "0.38.44"
diff --git a/Cargo.toml b/Cargo.toml
index 8ba97cb61b290..5d38ff3d8124b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -263,6 +263,7 @@ ahash = { version = "0.8.12", default-features = false }
 aho-corasick = "1.1.4"
 allocation-counter = "0"
 anyhow = "1.0.102"
+antithesis_sdk = "0.2.8"
 array-concat = "0.5.5"
 arrayvec = "0.7.6"
 arrow = { version = "57", default-features = false }
diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml
index f96d9991511dc..2e7f4f4a37ab7 100644
--- a/src/storage/Cargo.toml
+++ b/src/storage/Cargo.toml
@@ -15,6 +15,7 @@ bench = false
 
 [dependencies]
 anyhow.workspace = true
+antithesis_sdk.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
 aws-credential-types.workspace = true
diff --git a/src/storage/src/source/kafka.rs b/src/storage/src/source/kafka.rs
index 60ab8b8928058..2f6e8d28f960e 100644
--- a/src/storage/src/source/kafka.rs
+++ b/src/storage/src/source/kafka.rs
@@ -14,6 +14,7 @@ use std::sync::Arc;
 use std::thread;
 use std::time::Duration;
 
+use antithesis_sdk::{assert_always, assert_unreachable};
 use anyhow::anyhow;
 use chrono::{DateTime, NaiveDateTime};
 use differential_dataflow::{AsCollection, Hashable};
@@ -52,6 +53,7 @@ use rdkafka::statistics::Statistics;
 use rdkafka::topic_partition_list::Offset;
 use rdkafka::{ClientContext, Message, TopicPartitionList};
 use serde::{Deserialize, Serialize};
+use serde_json::json;
 use timely::PartialOrder;
 use timely::container::CapacityContainerBuilder;
 use timely::dataflow::channels::pact::Pipeline;
@@ -273,7 +275,13 @@ fn render_reader<'scope>(
                 .iter()
                 .map(|(_name, kind)| kind.clone())
                 .collect::<Vec<_>>(),
-            _ => panic!("unexpected source export details: {:?}", details),
+            _ => {
+                assert_unreachable!(
+                    "kafka: unexpected source export details",
+                    &json!({"source_id": id.to_string()})
+                );
+                panic!("unexpected source export details: {:?}", details)
+            }
         };
 
         let statistics = config
@@ -888,6 +896,11 @@ fn render_reader<'scope>(
                     }
                 }
                 // We can now put them back
+                assert_always!(
+                    reader.partition_consumers.is_empty(),
+                    "kafka: partition_consumers not drained at shutdown",
+                    &json!({"remaining": reader.partition_consumers.len()})
+                );
                 assert!(reader.partition_consumers.is_empty());
                 reader.partition_consumers = consumers;
 
@@ -1139,6 +1152,20 @@ impl KafkaSourceReader {
 
         // Given the explicit consumer to partition assignment, we should never receive a message
         // for a partition for which we have no metadata
+        let partition_known = self
+            .last_offsets
+            .get(output_index)
+            .map(|m| m.contains_key(&partition))
+            .unwrap_or(false);
+        assert_always!(
+            partition_known,
+            "kafka: partition missing from last_offsets",
+            &json!({
+                "source_id": self.id.to_string(),
+                "partition": partition,
+                "output_index": output_index,
+            })
+        );
         assert!(
             self.last_offsets
                 .get(output_index)
@@ -1190,6 +1217,13 @@ fn construct_source_message(
 ) {
     let pid = msg.partition();
     let Ok(offset) = u64::try_from(msg.offset()) else {
+        assert_unreachable!(
+            "kafka: negative offset from non-error message",
+            &json!({
+                "partition": msg.partition(),
+                "raw_offset": msg.offset(),
+            })
+        );
         panic!(
             "got negative offset ({}) from otherwise non-error'd kafka message",
             msg.offset()
diff --git a/src/storage/src/source/reclock/compat.rs b/src/storage/src/source/reclock/compat.rs
index a260e2dfcf060..607bbc4c5e680 100644
--- a/src/storage/src/source/reclock/compat.rs
+++ b/src/storage/src/source/reclock/compat.rs
@@ -15,6 +15,7 @@ use std::rc::Rc;
 use std::sync::Arc;
 use std::time::Duration;
 
+use antithesis_sdk::assert_unreachable;
 use anyhow::Context;
 use differential_dataflow::lattice::Lattice;
 use fail::fail_point;
@@ -33,6 +34,7 @@ use mz_storage_client::util::remap_handle::{RemapHandle, RemapHandleReader};
 use mz_storage_types::StorageDiff;
 use mz_storage_types::controller::CollectionMetadata;
 use mz_storage_types::sources::{SourceData, SourceTimestamp};
+use serde_json::json;
 use timely::order::{PartialOrder, TotalOrder};
 use timely::progress::Timestamp;
 use timely::progress::frontier::Antichain;
@@ -303,7 +305,13 @@ where
                 *self.shared_write_frontier.borrow_mut() = new_upper;
                 return result;
             }
-            Err(invalid_use) => panic!("compare_and_append failed: {invalid_use}"),
+            Err(invalid_use) => {
+                assert_unreachable!(
+                    "reclock: compare_and_append InvalidUsage",
+                    &json!({"error": invalid_use.to_string()})
+                );
+                panic!("compare_and_append failed: {invalid_use}")
+            }
         }
     }
 
diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs
index cdc583d76b119..5c8922de4c022 100644
--- a/src/storage/src/upsert.rs
+++ b/src/storage/src/upsert.rs
@@ -15,6 +15,7 @@ use std::hash::{Hash, Hasher};
 use std::path::PathBuf;
 use std::sync::Arc;
 
+use antithesis_sdk::{assert_always, assert_unreachable};
 use differential_dataflow::hashable::Hashable;
 use differential_dataflow::{AsCollection, VecCollection};
 use futures::StreamExt;
@@ -34,6 +35,7 @@ use mz_timely_util::builder_async::{
     PressOnDropButton,
 };
 use serde::{Deserialize, Serialize};
+use serde_json::json;
 use sha2::{Digest, Sha256};
 use timely::dataflow::channels::pact::Exchange;
 use timely::dataflow::operators::{Capability, InputCapability, Operator};
@@ -538,6 +540,11 @@ fn stage_input<T, FromTime>(
     }
 
     stash.extend(data.drain(..).map(|((key, value, order), time, diff)| {
+        assert_always!(
+            diff.is_positive(),
+            "upsert: input diff positive (classic)",
+            &json!({"diff": diff.into_inner()})
+        );
         assert!(diff.is_positive(), "invalid upsert input");
         (time, key, Reverse(order), value)
     }));
@@ -633,6 +640,10 @@ async fn drain_staged_input<S, T, FromTime, E>(
         let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) {
             command_state
         } else {
+            assert_unreachable!(
+                "upsert: key missing from commands_state (classic)",
+                &json!({"source_id": source_config.id.to_string()})
+            );
             panic!("key missing from commands_state");
         };
 
@@ -1028,5 +1039,9 @@ async fn process_upsert_state_error<T: Timestamp>(
     let update = HealthStatusUpdate::halting(e.context(context).to_string_with_causes(), None);
     health_output.give(health_cap, (None, update));
     std::future::pending::<()>().await;
+    assert_unreachable!(
+        "upsert: pending future returned (classic)",
+        &json!({"site": "process_upsert_state_error"})
+    );
     unreachable!("pending future never returns");
 }
diff --git a/src/storage/src/upsert/types.rs b/src/storage/src/upsert/types.rs
index 2bf8270aa2c95..57a4b85033563 100644
--- a/src/storage/src/upsert/types.rs
+++ b/src/storage/src/upsert/types.rs
@@ -88,11 +88,13 @@ use std::num::Wrapping;
 use std::sync::Arc;
 use std::time::Instant;
 
+use antithesis_sdk::{assert_always, assert_unreachable};
 use bincode::Options;
 use itertools::Itertools;
 use mz_ore::error::ErrorExt;
 use mz_repr::{Diff, GlobalId};
 use serde::{Serialize, de::DeserializeOwned};
+use serde_json::json;
 
 use crate::metrics::upsert::{UpsertMetrics, UpsertSharedMetrics};
 use crate::statistics::SourceStatistics;
@@ -294,6 +296,10 @@ impl<T, O> StateValue<T, O> {
         match self {
             Self::Value(value) => value,
             Self::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: into_decoded on Consolidating StateValue",
+                    &json!({"accessor": "into_decoded"})
+                );
                 panic!("called `into_decoded without calling `ensure_decoded`")
             }
         }
@@ -366,6 +372,10 @@ impl<T: Eq, O> StateValue<T, O> {
                 }),
             }),
             StateValue::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: into_provisional_value on Consolidating StateValue",
+                    &json!({"accessor": "into_provisional_value"})
+                );
                 panic!("called `into_provisional_value` without calling `ensure_decoded`")
             }
         }
@@ -400,6 +410,10 @@ impl<T: Eq, O> StateValue<T, O> {
                 }),
             }),
             StateValue::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: into_provisional_tombstone on Consolidating StateValue",
+                    &json!({"accessor": "into_provisional_tombstone"})
+                );
                 panic!("called `into_provisional_tombstone` without calling `ensure_decoded`")
             }
         }
@@ -413,6 +427,10 @@ impl<T: Eq, O> StateValue<T, O> {
                 _ => None,
             },
             Self::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: provisional_order on Consolidating StateValue",
+                    &json!({"accessor": "provisional_order"})
+                );
                 panic!("called `provisional_order` without calling `ensure_decoded`")
             }
         }
@@ -427,6 +445,10 @@ impl<T: Eq, O> StateValue<T, O> {
                 _ => value.finalized.as_ref(),
             },
             Self::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: provisional_value_ref on Consolidating StateValue",
+                    &json!({"accessor": "provisional_value_ref"})
+                );
                 panic!("called `provisional_value_ref` without calling `ensure_decoded`")
             }
         }
@@ -437,6 +459,10 @@ impl<T: Eq, O> StateValue<T, O> {
         match self {
             Self::Value(v) => v.finalized,
             Self::Consolidating(_) => {
+                assert_unreachable!(
+                    "upsert: into_finalized_value on Consolidating StateValue",
+                    &json!({"accessor": "into_finalized_value"})
+                );
                 panic!("called `into_finalized_value` without calling `ensure_decoded`")
             }
         }
@@ -577,7 +603,13 @@ impl<T: Eq, O> StateValue<T, O> {
                     *acc ^= val;
                 }
             }
-            _ => panic!("`merge_update_state` called with non-consolidating state"),
+            _ => {
+                assert_unreachable!(
+                    "upsert: merge_update_state on non-Consolidating state",
+                    &json!({"site": "merge_update_state"})
+                );
+                panic!("`merge_update_state` called with non-consolidating state")
+            }
         }
     }
 
@@ -618,29 +650,61 @@ impl<T: Eq, O> StateValue<T, O> {
                             })
                             .expect("invalid upsert state");
                         // Truncation is fine (using `as`) as this is just a checksum
+                        let want_checksum = seahash::hash(value) as i64;
+                        assert_always!(
+                            consolidating.checksum_sum.0 == want_checksum,
+                            "upsert: consolidating checksum_sum mismatch (diff_sum=1)",
+                            &json!({
+                                "source_id": source_id.to_string(),
+                                "checksum_sum": consolidating.checksum_sum.0,
+                                "expected_seahash": want_checksum,
+                            })
+                        );
                         assert_eq!(
-                            consolidating.checksum_sum.0,
-                            // Hash the value, not the full buffer, which may have extra 0's
-                            seahash::hash(value) as i64,
+                            consolidating.checksum_sum.0, want_checksum,
                             "invalid upsert state: checksum_sum does not match, state: {}, {}",
-                            consolidating,
-                            source_id,
+                            consolidating, source_id,
                         );
                         *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap());
                     }
                     0 => {
+                        assert_always!(
+                            consolidating.len_sum.0 == 0,
+                            "upsert: consolidating len_sum nonzero (diff_sum=0)",
+                            &json!({
+                                "source_id": source_id.to_string(),
+                                "len_sum": consolidating.len_sum.0,
+                            })
+                        );
                         assert_eq!(
                             consolidating.len_sum.0, 0,
                             "invalid upsert state: len_sum is non-0, state: {}, {}",
                             consolidating, source_id,
                         );
+                        assert_always!(
+                            consolidating.checksum_sum.0 == 0,
+                            "upsert: consolidating checksum_sum nonzero (diff_sum=0)",
+                            &json!({
+                                "source_id": source_id.to_string(),
+                                "checksum_sum": consolidating.checksum_sum.0,
+                            })
+                        );
                         assert_eq!(
                             consolidating.checksum_sum.0, 0,
                             "invalid upsert state: checksum_sum is non-0, state: {}, {}",
                             consolidating, source_id,
                         );
+                        let all_zero = consolidating.value_xor.iter().all(|&x| x == 0);
+                        assert_always!(
+                            all_zero,
+                            "upsert: consolidating value_xor nonzero (diff_sum=0)",
+                            &json!({
+                                "source_id": source_id.to_string(),
+                                "value_xor_len": consolidating.value_xor.len(),
+                            })
+                        );
                         assert!(
-                            consolidating.value_xor.iter().all(|&x| x == 0),
+                            all_zero,
                             "invalid upsert state: value_xor not all 0s with 0 diff. \
                             Non-zero positions: {:?}, state: {}, {}",
                             consolidating
@@ -669,6 +733,15 @@ impl<T: Eq, O> StateValue<T, O> {
                                 ),
                                 Err(_) => "Err(UpsertValueError)".to_string(),
                             });
+                        assert_unreachable!(
+                            "upsert: consolidating diff_sum not in {0,1}",
+                            &json!({
+                                "source_id": source_id.to_string(),
+                                "diff_sum": other,
+                                "value_byte_len": value_byte_len,
+                                "decodable": decode_ok,
+                            })
+                        );
                         panic!(
                             "invalid upsert state: non 0/1 diff_sum: {}, state: {}, {}, \
                             key: {:?}, value_byte_len: {:?}, decodable: {:?}",
@@ -1059,6 +1132,10 @@ where
         });
 
         if completed && self.snapshot_completed {
+            assert_unreachable!(
+                "upsert: snapshot completion called twice",
+                &json!({"site": "consolidate_chunk"})
+            );
             panic!("attempted completion of already completed upsert snapshot")
         }
 
diff --git a/src/storage/src/upsert_continual_feedback.rs b/src/storage/src/upsert_continual_feedback.rs
index a4669d3a80099..5fb562a7aa08a 100644
--- a/src/storage/src/upsert_continual_feedback.rs
+++ b/src/storage/src/upsert_continual_feedback.rs
@@ -14,6 +14,7 @@ use std::cmp::Reverse;
 use std::fmt::Debug;
 use std::sync::Arc;
 
+use antithesis_sdk::{assert_always, assert_unreachable};
 use differential_dataflow::hashable::Hashable;
 use differential_dataflow::{AsCollection, VecCollection};
 use indexmap::map::Entry;
@@ -23,6 +24,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError};
 use mz_timely_util::builder_async::{
     Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton,
 };
+use serde_json::json;
 use std::convert::Infallible;
 use timely::container::CapacityContainerBuilder;
 use timely::dataflow::StreamVec;
@@ -623,6 +625,11 @@ fn stage_input<T, FromTime>(
     }
 
     stash.extend(data.drain(..).map(|((key, value, order), time, diff)| {
+        assert_always!(
+            diff.is_positive(),
+            "upsert: input diff positive (cf v1)",
+            &json!({"diff": diff.into_inner()})
+        );
         assert!(diff.is_positive(), "invalid upsert input");
         (time, key, Reverse(order), value)
     }));
@@ -797,6 +804,10 @@ where
         let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) {
             command_state
         } else {
+            assert_unreachable!(
+                "upsert: key missing from commands_state (cf v1)",
+                &json!({"source_id": source_config.id.to_string()})
+            );
             panic!("key missing from commands_state");
         };
 
diff --git a/src/storage/src/upsert_continual_feedback_v2.rs b/src/storage/src/upsert_continual_feedback_v2.rs
index 32de9e3770086..8560ffd614603 100644
--- a/src/storage/src/upsert_continual_feedback_v2.rs
+++ b/src/storage/src/upsert_continual_feedback_v2.rs
@@ -65,6 +65,7 @@ use std::fmt::Debug;
 use std::marker::PhantomData;
 use std::sync::Arc;
 
+use antithesis_sdk::{assert_always, assert_unreachable};
 use differential_dataflow::difference::{IsZero, Semigroup};
 use differential_dataflow::hashable::Hashable;
 use differential_dataflow::lattice::Lattice;
@@ -81,6 +82,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError};
 use mz_timely_util::builder_async::{
     Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton,
 };
+use serde_json::json;
 use std::convert::Infallible;
 use timely::container::CapacityContainerBuilder;
 use timely::dataflow::StreamVec;
@@ -312,6 +314,11 @@ where
                     AsyncEvent::Data(cap, data) => {
                         let mut pushed_any = false;
                         for ((key, value, from_time), ts, diff) in data {
+                            assert_always!(
+                                diff.is_positive(),
+                                "upsert: input diff positive (cf v2)",
+                                &json!({"diff": diff.into_inner()})
+                            );
                             assert!(diff.is_positive(), "invalid upsert input");
                             if PartialOrder::less_equal(&input_upper, &resume_upper)
                                 && !resume_upper.less_equal(&ts)
@@ -480,7 +487,13 @@ where
                         (Some(a), Some(b)) => std::cmp::min(a, b).clone(),
                         (Some(a), None) => a.clone(),
                         (None, Some(b)) => b.clone(),
-                        (None, None) => unreachable!(),
+                        (None, None) => {
+                            assert_unreachable!(
+                                "upsert: cf v2 join produced (None, None)",
+                                &json!({"site": "min_ts join"})
+                            );
+                            unreachable!()
+                        }
                     };
                     cap.downgrade(&min_ts);
                 } else {
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 0645f1e868414..40b390c85529c 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -326,6 +326,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Reachability (Unreachable) |
 | **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit |
+| **Status** | **Implemented (SUT-side)** — every targeted site in `src/storage/src/upsert.rs` (stash diff-positive, `commands_state` missing key, `process_upsert_state_error` pending-future guard), `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion) gets a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`. Panics still terminate the process; Antithesis now also receives a reportable property failure with rich details. |
 | **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). |
 | **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. |
 | **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. |
@@ -337,6 +338,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — directly guards upsert state-store data integrity; catches XOR/checksum corruption |
+| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. Five `assert_always!` calls inside `ensure_decoded` covering the `diff_sum == 1` checksum match, the three `diff_sum == 0` zero-residue checks, and the `diff_sum ∉ {0,1}` impossible-state path. Each carries the consolidating state's diagnostic in `details`. |
 | **Property** | When the upsert state backend's `StateValue::ensure_decoded` finalizes a `Consolidating` cell into either a live `Value` or a `tombstone`, the consolidating accumulator is well-formed: `diff_sum ∈ {0, 1}`; if `diff_sum == 1` the recovered bytes match the recorded `len_sum` and `checksum_sum` (seahash of `value_xor[..len_sum]`); if `diff_sum == 0` then `len_sum == 0`, `checksum_sum == 0`, and every byte of `value_xor` is zero. |
 | **Invariant** | `Always`: the `panic!("invalid upsert state: non 0/1 diff_sum: …")` at `upsert/types.rs:672` becomes an `assert_always!(false, "upsert: non 0/1 diff_sum")` with a unique message. The intermediate `assert_eq!`s at :621, :632, :637 and the `assert!` at :642 are likewise upgraded to `assert_always!` so they report rather than crash. Each site gets a distinct, specific message. |
 | **Antithesis Angle** | The consolidating state collapses many `(diff, bytes)` updates per key into running `diff_sum`, `len_sum`, `checksum_sum`, and an XOR-merged `value_xor` blob. The invariant relies on (a) every retraction being paired with an identical insertion in the snapshot stream, and (b) the snapshot completion contract delivering exactly the durable state at the resume frontier. Antithesis explores: crash mid-snapshot-replay, RocksDB merge operator interleaved with multi_put, partial feedback delivery across restart, and (most subtly) duplicated retractions from multi-replica drain (commit 1accbe28b3). Any of these can break the XOR cancellation and trip a non-{0,1} diff_sum. |
@@ -348,6 +350,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Reachability (Unreachable) |
 | **Priority** | P2 — type-state protocol invariant; high-signal as a replay anchor |
+| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. Six `assert_unreachable!` calls, one per accessor (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`), each with a distinct message naming the accessor. Original `panic!` preserved after the assertion. |
 | **Property** | Every accessor on `StateValue` that requires the cell to be in `Value` form is preceded by a call to `ensure_decoded` for that cell. The six accessor panics — `into_decoded` (297), `into_provisional_value` (369), `into_provisional_tombstone` (403), `provisional_order` (416), `provisional_value_ref` (430), `into_finalized_value` (440) — never fire. |
 | **Invariant** | `Unreachable`: each `panic!("called \`...\` without calling \`ensure_decoded\`")` site is converted to a distinct `assert_unreachable!("upsert: <accessor> on Consolidating")`. Six unique assertion messages, one per accessor, so an Antithesis report distinguishes which contract was violated. These are pure protocol-misuse guards — they cannot fire in valid execution. |
 | **Antithesis Angle** | These panics are most likely to fire after a code change to the upsert operator (e.g. a new code path that forgets `ensure_decoded` before reading `provisional_value`). Antithesis exercises every operator branch under fault injection; turning these into reachability assertions gives a cheap regression-detection net for future refactors of `upsert.rs` / `upsert_continual_feedback*.rs`. They are also useful replay anchors — if Antithesis ever does reach them, the bug is reproducible. |
@@ -359,6 +362,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Reachability (Unreachable) |
 | **Priority** | P1 — direct regression target for topic-recreation and offset-handling bugs |
+| **Status** | **Implemented (SUT-side, production sites)** — `src/storage/src/source/kafka.rs` covers the four production panic/assert sites (`unexpected source export details`, `partition_consumers not drained at shutdown`, `partition missing from last_offsets`, `negative offset from non-error message`); `src/storage/src/source/reclock/compat.rs` covers `compare_and_append InvalidUsage`. The remaining `expect()` sites on resume-upper / statistics / offset arithmetic are deferred to a follow-up; they would be a wide mechanical conversion to soft assertions rather than reportable properties. |
 | **Property** | The explicit panics in `kafka.rs` never fire: `panic!("got negative offset (...)")` (kafka.rs:1193); `panic!("unexpected source export details: ...")` (kafka.rs:276); the `assert!(self.last_offsets[output][partition])` (kafka.rs:1142); plus the `expect()` sites on resume-upper / statistics / offset arithmetic. |
 | **Invariant** | `Unreachable`: each site converted to a unique `assert_unreachable!("kafka: <site>")`. The "negative offset" panic in particular is a known structural-invariant violation that has fired before. |
 | **Antithesis Angle** | Topic deletion + recreation, partition rebalancing, manual offset reset on the Kafka broker, clock jumps that interact with Kafka's internal offset arithmetic. Direct regression target for commit 99ad668af5 (capability downgrade on topic recreation). |

From 12f2c795344dbe2b581693df3601c9070e89f71f Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 17:40:15 -0400
Subject: [PATCH 20/65] test/antithesis: implement kafka-source-no-data-loss +
 kafka-source-no-data-duplication

---
 .../kafka-source-no-data-duplication.md       |   9 +
 .../properties/kafka-source-no-data-loss.md   |  15 ++
 .../scratchbook/property-catalog.md           |   2 +
 .../workload/test/helper_none_source.py       |  53 +++++
 .../parallel_driver_kafka_none_envelope.py    | 208 ++++++++++++++++++
 5 files changed, 287 insertions(+)
 create mode 100644 test/antithesis/workload/test/helper_none_source.py
 create mode 100755 test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py

diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
index fba0e8348808f..21780e5d10211 100644
--- a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md
@@ -39,6 +39,15 @@ Aggregates over the source double-count. Joins fan out. Downstream MVs become wr
 
 The runtime `assert!` in upsert.rs already aborts on negative input diffs — it just doesn't surface as an Antithesis property. Wrapping each callsite with `assert_always!` (per-site unique message) gives Antithesis the signal it needs without changing semantics outside Antithesis (the underlying `assert!` already aborts on violation).
 
+## Implementation status
+
+Implemented 2026-05-11 in two halves:
+
+- **NONE envelope, workload-side**: `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py` runs `SELECT partition, "offset", COUNT(*) FROM none_text_src WHERE text LIKE '<prefix>:%' GROUP BY 1,2 HAVING COUNT(*) > 1` after each catchup and asserts the result is empty via `always("kafka source: no duplicate (partition, offset)", details)`. Up to five offending rows are carried in `details` for triage.
+- **UPSERT envelope, SUT-side**: the `assert_always!(diff.is_positive(), ...)` family added by `upsert-no-internal-panic` covers the "duplicate retraction on input" symptom directly inside the operator at the three call sites in `upsert.rs`, `upsert_continual_feedback.rs`, `upsert_continual_feedback_v2.rs`. The workload-side per-key dedup check is part of `upsert-key-reflects-latest-value`.
+
+Per-payload visibility (the inverse-pair `kafka-source-no-data-loss` check) shares the same driver — both run on the same produce + catchup cycle to maximize signal per invocation.
+
 ## Provenance
 
 Surfaced by: Data Integrity, Concurrency, Failure Recovery. Direct regression target for database-issues#9160.
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
index 2a451a32d4312..e999c42b76083 100644
--- a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
@@ -37,6 +37,21 @@ The interesting window is mid-batch crash: a clusterd kill between the persist s
 
 None. No `assert_sometimes!` in the source path today (verified against `existing-assertions.md`). To implement: add an `assert_sometimes!` in the persist sink's `append_batches` after a successful append, plus a workload-side `assert_sometimes!` after the quiet-period catch-up check.
 
+## Implementation status
+
+Implemented 2026-05-11 (NONE envelope, workload-side) as `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. The driver is shared with `kafka-source-no-data-duplication` because both properties check the same dataflow. Assertions emitted for this property:
+
+| Message | Type | Fires when |
+|---------|------|------------|
+| `"kafka source caught up to produced offsets after quiet period (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor |
+| `"kafka source: every produced payload is visible exactly once"` | `always` | Per produced payload, after catchup; carries `payload`, `present`, `observed_count` in details |
+
+The UPSERT-envelope arm of this property is covered by `upsert-key-reflects-latest-value`.
+
+The SUT-side `assert_sometimes!(persist_sink_appended_batch, ...)` anchor in `append_batches` is **deferred** — it would tighten replay anchoring but the workload check above is already specific enough that triage can localize a failure without it.
+
+New helper: `helper_none_source.py` — idempotent `CREATE SOURCE ... FORMAT TEXT INCLUDE PARTITION, OFFSET ENVELOPE NONE`, reusing the shared `antithesis_kafka_conn` connection from `helper_upsert_source.py`.
+
 ## Provenance
 
 Surfaced by: Data Integrity, Failure Recovery, Product Context.
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 40b390c85529c..d5a8ed8925e6e 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -226,6 +226,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P0 — primary user-visible contract; "data is in Kafka but not in Materialize" is the worst possible streaming bug |
+| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. Per-payload `always("kafka source: every produced payload is visible exactly once", …)` joined to a quiet-period catchup wait. UPSERT-envelope version is covered by `upsert-key-reflects-latest-value`. The SUT-side `assert_sometimes!(persist_sink_appended_batch)` anchor in `append_batches` is deferred. |
 | **Property** | After producing a message to a Kafka topic, the Materialize source over that topic eventually contains a row corresponding to that message (NONE envelope) or a row reflecting the latest value for that key (UPSERT envelope). |
 | **Invariant** | `Sometimes(all_produced_records_visible)`: at least once during a run, after a quiet period, the workload observes `COUNT(*) FROM source` >= number of produced records (NONE) or every produced (key, value) pair is reflected in the source state (UPSERT). Liveness, so `Sometimes` on the catch-up event. |
 | **Antithesis Angle** | Network partitions between Materialize and Kafka, clusterd kills mid-ingestion, persist write retries, and rebalances. The interesting timing is the *crash mid-batch* window: some offsets are in persist, some are not, and the resume frontier determines what we re-read. Antithesis explores whether the re-read covers exactly the missing offsets. |
@@ -237,6 +238,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — silent duplication corrupts every aggregate downstream MV |
+| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. `always("kafka source: no duplicate (partition, offset)", …)` over a `GROUP BY partition, "offset" HAVING COUNT(*) > 1` query scoped to the invocation's prefix; carries up to five offending rows in `details`. UPSERT-envelope version is covered indirectly by `upsert-key-reflects-latest-value` (per-key uniqueness assertion) and directly by the SUT-side `assert_always!(diff.is_positive(), …)` of `upsert-no-internal-panic`. |
 | **Property** | After settling, the NONE-envelope source contains at most one row per `(partition, offset)` tuple; the UPSERT-envelope source contains at most one row per key. |
 | **Invariant** | `Always`: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1,2 HAVING COUNT(*) > 1` returns no rows for NONE; `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns no rows for UPSERT. Checked on every assertion firing — must hold on every observation. |
 | **Antithesis Angle** | Reader crashes between persist-sink batch write and `compare_and_append`; rehydration re-reads offsets we already wrote. The protection lives in `last_offsets` filtering (kafka.rs:1158) but only for the *current* incarnation — across restart, idempotency depends on the persist sink and (for UPSERT) the feedback-driven snapshot. Antithesis explores crash/restart timing across batch boundaries. Direct regression target for upsert double-retraction bug (commit 1accbe28b3, database-issues#9160). |
diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py
new file mode 100644
index 0000000000000..e9ecb358675c8
--- /dev/null
+++ b/test/antithesis/workload/test/helper_none_source.py
@@ -0,0 +1,53 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis NONE-envelope (append-only) Kafka source.
+
+Used by drivers that exercise the append-only contract. The source has columns
+`text TEXT, partition INTEGER, offset BIGINT` — `partition` and `offset` are
+the Kafka metadata projected via `INCLUDE PARTITION, OFFSET`, which give us
+the per-`(partition, offset)` uniqueness check called out in
+`kafka-source-no-data-duplication.md`.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from helper_pg import execute_retry
+from helper_upsert_source import ensure_kafka_connection
+
+LOG = logging.getLogger("antithesis.helper_none_source")
+
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+
+TOPIC_NONE_TEXT = "antithesis-none-text"
+SOURCE_NONE_TEXT = "none_text_src"
+
+
+def ensure_none_text_source() -> None:
+    """Create the append-only source over a text-valued Kafka topic.
+
+    Resulting columns: `text TEXT NOT NULL, partition INTEGER, offset BIGINT`.
+    Reuses the shared `antithesis_kafka_conn` Kafka connection so multiple
+    drivers don't proliferate connections.
+    """
+    ensure_kafka_connection()  # presumably creates antithesis_kafka_conn, which the CREATE SOURCE below references
+    execute_retry(
+        f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} "  # IF NOT EXISTS keeps repeated calls harmless
+        f"IN CLUSTER {CLUSTER} "
+        f"FROM KAFKA CONNECTION antithesis_kafka_conn (TOPIC '{TOPIC_NONE_TEXT}') "
+        f"FORMAT TEXT "
+        f"INCLUDE PARTITION, OFFSET "  # Kafka metadata surfaced as the partition/offset columns
+        f"ENVELOPE NONE"
+    )
+    LOG.info(
+        "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT
+    )
diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
new file mode 100755
index 0000000000000..9c3c0e2461cbe
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for the inverse-pair NONE-envelope properties:
+  - `kafka-source-no-data-loss` — every produced payload is visible exactly once
+  - `kafka-source-no-data-duplication` — no (partition, offset) appears twice
+
+The two run on the same dataflow because they are the symmetric failure modes
+of the same contract: one says "no row gone missing," the other says "no row
+duplicated." Settling once and asserting both halves catches both bugs from
+the same produce pass.
+
+Each invocation:
+  1. Ensures the NONE-envelope source exists.
+  2. Picks a per-invocation prefix so concurrent driver instances scope to
+     disjoint payloads. Every produced message has a `<prefix>:` prefix so the
+     workload can filter the source down to its own rows when asserting.
+  3. Produces N distinct payloads, recording the broker-assigned `(partition,
+     offset)` for each via the delivery callback.
+  4. Requests an Antithesis quiet period and waits for `offset_committed`
+     to reach the highest produced offset.
+  5. Runs two `assert_always` checks:
+       - "kafka source: no duplicate (partition, offset)" — `GROUP BY 1, 2 HAVING COUNT(*) > 1` is empty
+       - "kafka source: every produced payload is visible exactly once" —
+         fires per produced payload; payload, presence, and observed count
+         go into `details` so triage can localize which payloads went missing
+         or duplicated
+  6. Records one `assert_sometimes` liveness anchor confirming the safety
+     checks ran against settled data.
+
+This is a `parallel_driver_` — many concurrent instances exercise the source
+without colliding because each invocation owns its prefix range.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+import helper_random
+from helper_kafka import make_producer
+from helper_none_source import (
+    SOURCE_NONE_TEXT,
+    TOPIC_NONE_TEXT,
+    ensure_none_text_source,
+)
+from helper_pg import query_retry
+from helper_quiet import request_quiet_period
+from helper_source_stats import wait_for_catchup
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.kafka_none_envelope")
+
+# Knobs. Tuned so each invocation is a small, self-contained unit of work
+# — Antithesis launches the driver many times and accumulates coverage
+# across invocations, not within one giant batch.
+PRODUCES_PER_INVOCATION = 50
+QUIET_PERIOD_S = 20
+CATCHUP_TIMEOUT_S = 60.0
+
+
+def main() -> int:  # one produce -> settle -> assert pass; every return path yields 0
+    ensure_none_text_source()
+
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; prefix=%s", prefix)
+
+    producer, tracker = make_producer(client_id=f"antithesis-none-{prefix}")
+
+    # The set of payloads we attempted to produce. Each is unique to
+    # (prefix, index) so we can filter the source on `text LIKE prefix:%`
+    # and join payloads back to (partition, offset) without tracking them
+    # at produce time.
+    expected_payloads: set[str] = set()
+    for i in range(PRODUCES_PER_INVOCATION):
+        payload = f"{prefix}:{i:06d}"
+        producer.produce(
+            topic=TOPIC_NONE_TEXT,
+            value=payload.encode("utf-8"),
+            on_delivery=tracker.callback,
+        )
+        expected_payloads.add(payload)
+        producer.poll(0)
+
+    pending = producer.flush(timeout=30)
+    if pending > 0 or tracker.last_error is not None:
+        # Same fail-closed pattern as the upsert driver: under sustained
+        # fault injection we cannot prove which messages Kafka accepted, so
+        # the expected set may name payloads the source never saw. Bail
+        # before running safety assertions.
+        LOG.info(
+            "skipping assertions: producer.flush pending=%d last_error=%s",
+            pending,
+            tracker.last_error,
+        )
+        return 0
+
+    max_produced = tracker.topic_max_offset(TOPIC_NONE_TEXT)
+    if max_produced < 0:
+        LOG.info("no messages confirmed delivered this invocation; exiting cleanly")
+        return 0
+
+    # `max_produced` (from `tracker.topic_max_offset`; presumably the highest
+    # delivered offset for the topic — confirm in helper_kafka) is the target
+    # handed to `wait_for_catchup` once the quiet period has been requested.
+
+    request_quiet_period(QUIET_PERIOD_S)
+    caught_up = wait_for_catchup(
+        SOURCE_NONE_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
+    )
+
+    sometimes(  # liveness anchor: recorded whether or not catch-up succeeded
+        caught_up,
+        "kafka source caught up to produced offsets after quiet period (none envelope)",
+        {"source": SOURCE_NONE_TEXT, "target_offset": max_produced},
+    )
+
+    if not caught_up:
+        LOG.info("catchup did not complete in budget; skipping per-payload assertions")
+        return 0
+
+    # ----- no-data-duplication -----
+    # `GROUP BY partition, "offset" HAVING COUNT(*) > 1` filtered to this
+    # invocation's payloads. The catalog's `kafka-source-no-data-duplication`
+    # property names this exact query shape.
+    dup_rows = query_retry(
+        f"""
+        SELECT partition, "offset", COUNT(*)::bigint
+        FROM {SOURCE_NONE_TEXT}
+        WHERE text LIKE %s
+        GROUP BY 1, 2
+        HAVING COUNT(*) > 1
+        """,
+        (f"{prefix}:%",),
+    )
+    always(
+        len(dup_rows) == 0,
+        "kafka source: no duplicate (partition, offset)",
+        {
+            "source": SOURCE_NONE_TEXT,
+            "prefix": prefix,
+            "dupe_count": len(dup_rows),
+            # Carry up to a handful of offending rows for triage.
+            "examples": [
+                {"partition": int(p), "offset": int(o), "count": int(c)}
+                for (p, o, c) in dup_rows[:5]
+            ],
+        },
+    )
+
+    # ----- no-data-loss -----
+    # Confirm every payload we produced is visible *exactly once*. The join is
+    # client-side: one query returns this prefix's rows grouped by (text,
+    # partition, offset); each expected payload is then looked up by `text`.
+    # NOTE(review): a payload at two distinct offsets keeps only the last row.
+    #
+    # We batch all payloads into one query rather than one round-trip per
+    # payload, so the assertion fires once per payload but the SQL cost
+    # stays bounded.
+    rows = query_retry(
+        f"""
+        SELECT text, partition, "offset", COUNT(*)::bigint
+        FROM {SOURCE_NONE_TEXT}
+        WHERE text LIKE %s
+        GROUP BY 1, 2, 3
+        """,
+        (f"{prefix}:%",),
+    )
+    by_payload: dict[str, tuple[int, int, int]] = {}
+    for text, partition, offset, count in rows:
+        by_payload[text] = (int(partition), int(offset), int(count))
+
+    for payload in expected_payloads:
+        info = by_payload.get(payload)
+        present = info is not None
+        count = info[2] if info else 0
+        always(
+            present and count == 1,
+            "kafka source: every produced payload is visible exactly once",
+            {
+                "source": SOURCE_NONE_TEXT,
+                "prefix": prefix,
+                "payload": payload,
+                "present": present,
+                "observed_count": count,
+            },
+        )
+
+    LOG.info(
+        "driver done; asserted no-dupe + per-payload visibility on %d produced payloads",
+        len(expected_payloads),
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From fd6722e7c5385372f70f88276960774e15e7fce1 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 17:48:51 -0400
Subject: [PATCH 21/65] test/antithesis: implement frontier-monotonic,
 tombstone-removes-key, state-rehydrates-correctly

---
 .../kafka-source-frontier-monotonic.md        |  14 +
 .../upsert-state-rehydrates-correctly.md      |  17 ++
 .../upsert-tombstone-removes-key.md           |   9 +
 .../scratchbook/property-catalog.md           |   3 +
 .../test/anytime_kafka_frontier_monotonic.py  | 136 ++++++++++
 .../parallel_driver_upsert_latest_value.py    |  23 ++
 ...ngleton_driver_upsert_state_rehydration.py | 248 ++++++++++++++++++
 7 files changed, 450 insertions(+)
 create mode 100755 test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
 create mode 100755 test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py

diff --git a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
index 03f551e5cbd9f..b22aa8d0e6852 100644
--- a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
+++ b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md
@@ -35,6 +35,20 @@ A: The retry loop does protect — but only if `sync()` is called *before* the l
 
 None. The persist-side `panic!("compare_and_append failed: …")` in `reclock/compat.rs:306` is informational, not a property. Wrap with `assert_unreachable!` for the genuinely-invalid case and add an `assert_always!` for the workload-observable monotonicity.
 
+## Implementation status
+
+Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. The `anytime_` driver runs throughout the timeline alongside other drivers while faults are active. Each poll iteration:
+
+1. Lists every source in `SOURCES = ["upsert_text_src", "none_text_src"]` that currently exists in the catalog (so an early-timeline poll before sources are created doesn't produce spurious assertion failures).
+2. For each source, calls `helper_source_stats.offset_committed()` (a `MAX(offset_committed)` over `mz_internal.mz_source_statistics` joined to `mz_sources` by name).
+3. Compares against the previous observation for that source in `last_seen`. The assertion `always("kafka: source offset_committed non-monotonic", details)` is evaluated only when both the previous and the current observation succeeded — a network partition or an unavailable clusterd is expected under faults and is not an assertion target.
+
+`details` carries `source`, `previous`, `observed`, and `regression` (`previous - observed`).
+
+The SUT-side `assert_always!` in `append_batches` and the `reclock/compat.rs` `compare_and_append` paths (commit `e3805ad790`'s and `505dc96aaa`'s code paths) are deferred — the workload signal is sufficient to catch any externally-visible regression. Add SUT instrumentation later if Antithesis surfaces failures that need internal localization.
+
+The complementary `offset-known-not-below-committed` property has a similar shape and could be added to this same driver at minimal cost; that is deliberately deferred to keep this commit scoped to the three user-requested properties.
+
 ## Provenance
 
 Surfaced by: Data Integrity, Distributed Coordination. Direct regression target for commits `e3805ad790` and `505dc96aaa`.
diff --git a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
index 336deb408759b..287d967d02c47 100644
--- a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
+++ b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md
@@ -41,6 +41,23 @@ Compounded by RocksDB merge operator behavior (commit `0d8d740b47`): if the merg
 
 None. Candidate SUT anchors: an `assert_sometimes!(upsert_snapshot_completed, "upsert: snapshot phase completed")` at the snapshot-completion call site, and `assert_always!(diff_sum_in_range, …)` mirroring the existing `panic!` in `ensure_decoded`.
 
+## Implementation status
+
+Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. The `singleton_driver_` command runs exactly once per timeline and spans multiple produce/settle/assert cycles, holding the expected-state model (the `expected` dict) in process memory across cycles:
+
+| Message | Type | Fires when |
+|---------|------|------------|
+| `"upsert: rehydrated state matches local model (live key)"` | `always` | Per live key, per cycle, after catchup. Cross-cycle stability of `expected` is the rehydration check. |
+| `"upsert: rehydrated state matches local model (tombstoned key)"` | `always` | Per tombstoned key, per cycle, after catchup. |
+| `"upsert: rehydration driver ran 2+ assertion cycles"` | `sometimes` | Once per invocation; confirms the safety check ran against multiple settle cycles (not just one early cycle that masks rehydration). |
+| `"upsert: rehydration driver observed clusterd replica non-online"` | `sometimes` | Best-effort proxy: `mz_internal.mz_cluster_replica_statuses` showed an `antithesis_cluster` replica in a non-`online` status during the run. Not a guarantee that a restart happened, but a noisy yes-signal that something disturbed the cluster. |
+
+Knobs: `CYCLE_COUNT=8`, `PRODUCES_PER_CYCLE=30`, `DISTINCT_KEYS=6` (small enough that keys are revisited within and across cycles), `TOMBSTONE_PROB=0.20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=120`, `INTER_CYCLE_SLEEP_S=2`.
+
+**Requires node-termination faults enabled** in the Antithesis tenant for the property to be exercised at full strength. Without restarts, the cross-cycle stability check still catches divergence from the operator processing a sequence of upserts/tombstones (i.e., it falls back to a slower version of `upsert-key-reflects-latest-value`).
+
+SUT-side anchors at the upsert snapshot-completion call sites are deferred and would tighten replay anchoring.
+
 ## Provenance
 
 Surfaced by: Failure Recovery, Data Integrity.
diff --git a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
index 74f5f13a7ba49..50ee185c746f1 100644
--- a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
+++ b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md
@@ -33,6 +33,15 @@ A deleted row reappears after restart. Compliance and correctness hazard. The li
 
 None. Workload-side check. The `StateValue::tombstone` construction path and the `ensure_decoded` tombstone branch are the relevant code; adding `assert_sometimes!(tombstone_emitted, ...)` inside the tombstone-emit path gives a coverage signal.
 
+## Implementation status
+
+Implemented 2026-05-11 (workload-side) inside the existing `parallel_driver_upsert_latest_value.py`:
+
+- Safety half: `always("upsert: tombstoned key has no row in source", ...)` (already existed for `upsert-key-reflects-latest-value`) — fires per key whose latest produced message was a tombstone.
+- Path-exercise anchor: new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)`. The driver counts `tombstoned_after_value` — the number of tombstone produces where the immediately-prior produced value for that key was a live value. Without this anchor, the `always` could be vacuously satisfied by tombstones against never-written keys.
+
+The "no resurrection across restart" half is covered structurally by `upsert-state-rehydrates-correctly`'s cross-cycle stability check, which includes tombstoned keys in its per-key assertion loop (`"upsert: rehydrated state matches local model (tombstoned key)"`).
+
 ## Provenance
 
 Surfaced by: Data Integrity, Lifecycle Transitions (delete operations).
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index d5a8ed8925e6e..9e94cdf8ed089 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -250,6 +250,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P1 — frontier regression panics downstream operators and breaks `AS OF` queries |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. Continuous `anytime_` driver polls `mz_internal.mz_source_statistics.offset_committed` for every Kafka source in `antithesis_cluster` every 500ms and evaluates `always("kafka: source offset_committed non-monotonic", details)` on each pair of successive successful samples; the assertion fails when a new sample is less than the previous one. Faults are active throughout. SUT-side `assert_always!(new_upper >= prev_upper, ...)` in `append_batches` is deferred. |
 | **Property** | The `upper` frontier of the source's data persist shard never regresses across the lifetime of the source, including across clusterd restarts and `compare_and_append` retries. |
 | **Invariant** | `Always`: observed `upper(t2) >= upper(t1)` for any observation order `t1 < t2`. Checked on every observation in a workload polling loop, and ideally also as a SUT-side `assert_always!` next to the persist sink's `compare_and_append`. |
 | **Antithesis Angle** | Kill clusterd mid-`compare_and_append`; resume the source with a stale cached upper; concurrent reclock and persist-sink writers. Direct regression target for the `as_of`/reclock-upper race (commit e3805ad790, database-issues#8698) and the persist-sink cached upper bug (commit 505dc96aaa). |
@@ -295,6 +296,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P1 — delete semantics are routinely relied on for GDPR/correctness |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. The existing `always("upsert: tombstoned key has no row in source", ...)` covers the safety half; a new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)` confirms the *interesting* tombstone path (tombstone replacing a live value) is exercised rather than the trivial "tombstone a never-written key" case. |
 | **Property** | After producing a `(key, null)` tombstone message to the Kafka topic, the UPSERT source eventually contains no row for that key, and the row stays absent until a new non-null value is produced. |
 | **Invariant** | `Always`: at any settled observation after the tombstone has been ingested (resume_upper > tombstone offset), `SELECT * FROM source WHERE key = ?` returns 0 rows. The "no resurrection" half is also `Always`: a key that has been tombstoned and not re-inserted must not reappear after a clusterd restart or rehydration cycle. |
 | **Antithesis Angle** | Race the tombstone against a state-store snapshot completion. Crash clusterd between persist sink writing the retraction and the upsert state recording the tombstone. The `StateValue::Value` -> tombstone path in `upsert/types.rs` is the relevant code; bugs here look like resurrected rows. |
@@ -306,6 +308,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P1 — incorrect rehydration produces wrong-but-plausible-looking output |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. Long-running `singleton_driver_` runs N produce→settle→assert cycles holding `expected_state` in process memory. Cross-cycle stability is the rehydration check: if a clusterd restart lands between cycles, the next cycle's `always("upsert: rehydrated state matches local model (live key|tombstoned key)", ...)` verifies the rebuilt source matches the pre-restart model. Requires node-termination faults enabled. |
 | **Property** | After a clusterd restart, the rehydrated upsert state, as observed via `SELECT * FROM source`, equals the state at the most recent durable timestamp before the restart, for every key produced so far. |
 | **Invariant** | `Always`: after a kill+restart quiet period, the workload's local key/value model matches the source's contents for every key whose latest message has `offset <= resume_upper`. Combines with `kafka-source-no-data-duplication` (no double inserts on rehydration) and `upsert-key-reflects-latest-value` (correct value per key). |
 | **Antithesis Angle** | The interesting window is between `compare_and_append` of the persist sink and the upsert operator's feedback-driven snapshot completion. If the feedback replay deduplication is wrong, rehydrated state diverges from durable state. Direct regression target for the upsert snapshot-completion logic in `upsert/types.rs` and `upsert_continual_feedback*`. |
diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
new file mode 100755
index 0000000000000..faee0fd0c680e
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `kafka-source-frontier-monotonic`.
+
+The `upper` of a Kafka source's persist data shard must never regress across
+its lifetime, including across clusterd restarts and `compare_and_append`
+retries. Approximated via the workload-visible `offset_committed` reported
+in `mz_internal.mz_source_statistics`, which is the durably-ingested
+upstream offset for the source.
+
+This is an `anytime_` driver — it runs continuously throughout the timeline,
+polling all of this workload's Kafka sources and asserting that each one's
+`offset_committed` never decreases between successive observations. Faults
+are active while it runs, which is the right shape for a continuous safety
+invariant: Antithesis can crash clusterd between two of our polls and the
+next poll must still report a value >= the previous one.
+
+The driver exits after a bounded budget so Antithesis can re-launch it
+freely without one instance pinning resources. Nothing in process memory
+carries across invocations: `last_seen` starts empty on each launch, so an
+instance can only detect a regression between two of its own polls.
+Antithesis runs many instances in parallel, and the union of their
+observations covers the regression window.
+
+Errors during polling (network partitions, clusterd unavailable) are
+*expected* under fault injection and must not produce false-positive
+failures. We only assert when we have two successive successful reads for
+the same source.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+from helper_pg import query_retry
+from helper_source_stats import offset_committed
+
+from antithesis.assertions import always
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.kafka_frontier_monotonic")
+
+# Knobs.
+POLL_INTERVAL_S = 0.5
+RUN_BUDGET_S = 30.0
+
+# The Antithesis cluster every driver in this workload provisions sources into.
+# Discovering sources dynamically (rather than hardcoding names) means new
+# drivers that introduce new Kafka sources get monotonicity coverage for free.
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+
+
+def _sources_present() -> list[str]:
+    """Return every Kafka source currently owned by `antithesis_cluster`."""
+    rows = query_retry(
+        """
+        SELECT s.name
+        FROM mz_sources s
+        JOIN mz_clusters c ON c.id = s.cluster_id
+        WHERE c.name = %s AND s.type = 'kafka'
+        """,
+        (ANTITHESIS_CLUSTER,),
+    )
+    return [r[0] for r in rows]
+
+
+def main() -> int:
+    deadline = time.monotonic() + RUN_BUDGET_S
+    # Per-source most recent committed offset observed by this invocation's
+    # polls. Each successful new read for a source must be >= last_seen.
+    last_seen: dict[str, int] = {}
+    polled = 0
+
+    while time.monotonic() < deadline:
+        try:
+            sources = _sources_present()
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("source list query failed: %s; sleeping and retrying", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        for source in sources:
+            try:
+                observed = offset_committed(source)
+            except Exception as exc:  # noqa: BLE001
+                LOG.info("offset_committed query failed for %s: %s", source, exc)
+                continue
+            if observed is None:
+                # Statistics row not initialized yet (very early in source
+                # lifetime, or post-restart before stats first reported).
+                # Not an assertion target.
+                continue
+
+            prev = last_seen.get(source)
+            if prev is not None:
+                always(
+                    observed >= prev,
+                    "kafka: source offset_committed non-monotonic",
+                    {
+                        "source": source,
+                        "previous": prev,
+                        "observed": observed,
+                        "regression": prev - observed,
+                    },
+                )
+
+            # Always update last_seen, even on regression — we want to keep
+            # asserting against the most recent observation so a regression
+            # surfaces once per discrete drop, not on every subsequent poll.
+            last_seen[source] = observed
+            polled += 1
+
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.info(
+        "frontier monotonic check done; %d samples across %d sources",
+        polled,
+        len(last_seen),
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index 7aa54acb3192d..066620aaf6ded 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -117,10 +117,19 @@ def main() -> int:
     # Value of None means "the last message was a tombstone".
     expected: dict[str, str | None] = {}
 
+    # Count of times we tombstoned a key whose immediately-prior produced
+    # value was a live value (not absent, not already tombstoned). This is
+    # the exact `upsert-tombstone-removes-key` exercise pattern: the
+    # interesting case is "remove a row that was just there," not "tombstone
+    # a key we never wrote to."
+    tombstoned_after_value = 0
+
     keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)]
     for _ in range(PRODUCES_PER_INVOCATION):
         key = helper_random.random_choice(keys)
         if helper_random.random_bool(TOMBSTONE_PROB):
+            if expected.get(key) is not None:
+                tombstoned_after_value += 1
             _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None)
             expected[key] = None
         else:
@@ -203,6 +212,20 @@ def main() -> int:
                 },
             )
 
+    # Liveness anchor for `upsert-tombstone-removes-key`: confirms the
+    # interesting tombstone path (tombstone replacing a live value) was
+    # exercised at least once during the run. Without this, the
+    # `always(not found, "upsert: tombstoned key has no row in source", ...)`
+    # check above might fire only against keys that were never live.
+    sometimes(
+        tombstoned_after_value > 0,
+        "upsert: tombstone overwrote a live value at least once this invocation",
+        {
+            "tombstoned_after_value": tombstoned_after_value,
+            "produces": PRODUCES_PER_INVOCATION,
+        },
+    )
+
     LOG.info("driver done; asserted on %d keys", len(expected))
     return 0
 
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
new file mode 100755
index 0000000000000..5c41c406f3210
--- /dev/null
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `upsert-state-rehydrates-correctly`.
+
+After a clusterd restart, the rehydrated upsert state — observed via
+`SELECT * FROM source` — must equal the state at the most recent durable
+timestamp before the restart, for every key produced so far.
+
+Implementation strategy: a `singleton_driver_` runs exactly once per
+timeline and lives long enough to span multiple produce/settle/assert
+cycles. Local memory holds the authoritative "what the source should say"
+model across cycles. If Antithesis kills clusterd between two cycles, the
+next cycle's `SELECT` is effectively a rehydration check — and because the
+local model is unchanged across the restart, any divergence in the source
+output is exactly the property's failure mode.
+
+Each cycle:
+  1. Produce a batch of (key, value) and (key, null) messages, updating the
+     in-memory `expected_state` model.
+  2. Request a quiet period and wait for `offset_committed` to reach the
+     highest produced offset.
+  3. SELECT every tracked key's current source state and assert it matches
+     `expected_state` via `always("upsert: rehydrated state matches local
+     model (live key|tombstoned key)", ...)`. Across-cycle stability is
+     exactly what rehydration correctness is.
+
+The driver also records one `sometimes` anchor confirming that at least
+two assertion-bearing cycles ran (without this, the safety check could be
+vacuously satisfied by a single early settle), and a second anchor
+confirming that a post-cycle probe of `mz_cluster_replica_statuses` saw an
+`antithesis_cluster` replica that was not `online` (best-effort proxy for
+"restart happened"; a probe that itself fails counts as not observed).
+
+Keys use a fixed `reh-` prefix, presumably to stay clear of other drivers' keys.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_random
+from helper_kafka import make_producer
+from helper_pg import query_one_retry
+from helper_quiet import request_quiet_period
+from helper_source_stats import wait_for_catchup
+from helper_upsert_source import (
+    SOURCE_UPSERT_TEXT,
+    TOPIC_UPSERT_TEXT,
+    ensure_upsert_text_source,
+)
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.upsert_state_rehydration")
+
+# Long-running knobs — this driver owns its timeline alongside parallel
+# drivers, so the per-cycle budget is generous and the cycle count high
+# enough that a node-termination fault has a chance to land between cycles.
+CYCLE_COUNT = 8
+PRODUCES_PER_CYCLE = 30
+DISTINCT_KEYS = 6
+DISTINCT_VALUES = 12
+TOMBSTONE_PROB = 0.20
+
+QUIET_PERIOD_S = 25
+CATCHUP_TIMEOUT_S = 120.0
+INTER_CYCLE_SLEEP_S = 2.0
+
+
+def _select_value_for_key(key: str) -> tuple[bool, str | None]:
+    """Duplicate of `_select_value_for_key` in `parallel_driver_upsert_latest_value.py`.
+    Kept inline to avoid expanding helper surface for one shared private function."""
+    row = query_one_retry(
+        f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s",
+        (key,),
+    )
+    if row is None:
+        return False, None
+    count, value = row
+    if count == 0:
+        return False, None
+    if count != 1:
+        raise RuntimeError(
+            f"upsert source has {count} rows for key {key!r}; this driver "
+            "assumes the per-key uniqueness property holds (see "
+            "`upsert-key-reflects-latest-value` and "
+            "`kafka-source-no-data-duplication`)"
+        )
+    return True, value
+
+
+def _saw_clusterd_unavailable() -> bool:
+    """Best-effort probe: does `mz_internal.mz_cluster_replica_statuses` show
+    any `antithesis_cluster` replica with `status != 'online'` right now?
+    The status column reports `online` or `offline`. Catching `offline`
+    in a snapshot doesn't *prove* a restart happened (we may have missed
+    a transient flap entirely), but it's a noisy yes-signal that something
+    disturbed the cluster during the cycle.
+    """
+    try:
+        row = query_one_retry("""
+            SELECT EXISTS (
+                SELECT 1
+                FROM mz_internal.mz_cluster_replica_statuses s
+                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                JOIN mz_clusters c ON c.id = r.cluster_id
+                WHERE c.name = 'antithesis_cluster' AND s.status != 'online'
+            )
+            """)
+    except Exception:  # noqa: BLE001
+        return False
+    return bool(row and row[0])
+
+
+def _run_cycle(
+    producer, tracker, expected: dict[str, str | None], cycle_idx: int
+) -> bool:
+    """Produce one batch, settle, and assert state for every tracked key.
+
+    Returns True if assertions ran (cycle settled), False if we bailed early.
+    """
+    keys = [f"reh-k{i}" for i in range(DISTINCT_KEYS)]
+    for _ in range(PRODUCES_PER_CYCLE):
+        key = helper_random.random_choice(keys)
+        if helper_random.random_bool(TOMBSTONE_PROB):
+            producer.produce(
+                topic=TOPIC_UPSERT_TEXT,
+                key=key.encode("utf-8"),
+                value=None,
+                on_delivery=tracker.callback,
+            )
+            expected[key] = None
+        else:
+            value = f"reh-v{cycle_idx:02d}-{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}"
+            producer.produce(
+                topic=TOPIC_UPSERT_TEXT,
+                key=key.encode("utf-8"),
+                value=value.encode("utf-8"),
+                on_delivery=tracker.callback,
+            )
+            expected[key] = value
+        producer.poll(0)
+
+    pending = producer.flush(timeout=30)
+    if pending > 0 or tracker.last_error is not None:
+        LOG.info(
+            "cycle %d: skipping assertions; flush pending=%d last_error=%s",
+            cycle_idx,
+            pending,
+            tracker.last_error,
+        )
+        return False
+
+    max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT)
+    if max_produced < 0:
+        LOG.info("cycle %d: no messages confirmed delivered; skipping", cycle_idx)
+        return False
+
+    request_quiet_period(QUIET_PERIOD_S)
+    caught_up = wait_for_catchup(
+        SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
+    )
+    if not caught_up:
+        LOG.info(
+            "cycle %d: catchup did not complete in budget; skipping asserts", cycle_idx
+        )
+        return False
+
+    # Per-key assertion. The cross-cycle stability of `expected` is what
+    # makes this a rehydration check: if a clusterd restart happened
+    # between this cycle and the previous, the source has been rebuilt
+    # from feedback and must agree with `expected` again.
+    for key, want in expected.items():
+        found, observed = _select_value_for_key(key)
+        if want is None:
+            always(
+                not found,
+                "upsert: rehydrated state matches local model (tombstoned key)",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "cycle": cycle_idx,
+                    "observed_value": observed,
+                },
+            )
+        else:
+            always(
+                found and observed == want,
+                "upsert: rehydrated state matches local model (live key)",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "cycle": cycle_idx,
+                    "expected_value": want,
+                    "observed_present": found,
+                    "observed_value": observed,
+                },
+            )
+    return True
+
+
+def main() -> int:
+    ensure_upsert_text_source()
+    LOG.info("rehydration driver starting; %d cycles planned", CYCLE_COUNT)
+
+    producer, tracker = make_producer(client_id="antithesis-rehydration")
+    expected: dict[str, str | None] = {}
+
+    cycles_run = 0
+    saw_replica_unavailable = False
+
+    for cycle_idx in range(CYCLE_COUNT):
+        if _run_cycle(producer, tracker, expected, cycle_idx):
+            cycles_run += 1
+        if _saw_clusterd_unavailable():
+            saw_replica_unavailable = True
+        time.sleep(INTER_CYCLE_SLEEP_S)
+
+    sometimes(
+        cycles_run >= 2,
+        "upsert: rehydration driver ran 2+ assertion cycles",
+        {"cycles_run": cycles_run, "cycles_planned": CYCLE_COUNT},
+    )
+    sometimes(
+        saw_replica_unavailable,
+        "upsert: rehydration driver observed clusterd replica non-online",
+        {"cycles_run": cycles_run},
+    )
+
+    LOG.info("rehydration driver done; %d/%d cycles ran", cycles_run, CYCLE_COUNT)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From bb02873d824a0f83d5878186f97ec18fde7ccb1b Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 18:24:56 -0400
Subject: [PATCH 22/65] ci: scope CI_ANTITHESIS build to materialized +
 antithesis-{workload,config} + transitive deps

---
 ci/test/build.py                   | 20 +++++++++++++++++++-
 test/antithesis/push-antithesis.py |  3 +++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/ci/test/build.py b/ci/test/build.py
index 5b18ce91e9b31..95f4227afbaa7 100755
--- a/ci/test/build.py
+++ b/ci/test/build.py
@@ -47,7 +47,25 @@ def main() -> None:
         # Build and push any images that are not already available on Docker Hub,
         # so they are accessible to other build agents.
         print("--- Acquiring mzbuild images")
-        deps = repo.resolve_dependencies(image for image in repo if image.publish)
+        if antithesis:
+            # Antithesis only consumes these three images; everything else in
+            # the repo (balancerd, sqllogictest, testdrive, ...) is wasted CI
+            # time for this pipeline. resolve_dependencies walks depends_on
+            # transitively, so anything materialized actually needs still
+            # comes along. Keep this list in sync with ANTITHESIS_IMAGES in
+            # test/antithesis/push-antithesis.py.
+            antithesis_images = [
+                "materialized",
+                "antithesis-workload",
+                "antithesis-config",
+            ]
+            deps = repo.resolve_dependencies(
+                repo.images[name] for name in antithesis_images
+            )
+        else:
+            deps = repo.resolve_dependencies(
+                image for image in repo if image.publish
+            )
         deps.ensure(pre_build=lambda images: upload_debuginfo(repo, images))
         set_build_status("success")
         annotate_buildkite_with_tags(repo.rd.arch, deps)
diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py
index 2787f5cee8f30..fe1dc7555ea74 100755
--- a/test/antithesis/push-antithesis.py
+++ b/test/antithesis/push-antithesis.py
@@ -37,6 +37,9 @@
 # Images Antithesis needs to be able to pull:
 #   - antithesis-config holds the docker-compose.yaml + .env Antithesis runs.
 #   - materialized + antithesis-workload are referenced by that compose.
+# Keep this list in sync with the `antithesis_images` branch in
+# ci/test/build.py — that's where CI_ANTITHESIS scopes the mzbuild walk so
+# the nightly doesn't waste time building images Antithesis never consumes.
 ANTITHESIS_IMAGES = ["materialized", "antithesis-workload", "antithesis-config"]
 
 

From 0a1fa97d3510001d54134c98ad953e070097282e Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 20:48:21 -0400
Subject: [PATCH 23/65] test/antithesis: pre-create kafka topics before CREATE
 SOURCE

---
 test/antithesis/workload/test/helper_none_source.py   |  6 ++++++
 test/antithesis/workload/test/helper_upsert_source.py | 10 +++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py
index e9ecb358675c8..a3cb3c1704be1 100644
--- a/test/antithesis/workload/test/helper_none_source.py
+++ b/test/antithesis/workload/test/helper_none_source.py
@@ -21,6 +21,7 @@
 import logging
 import os
 
+from helper_kafka import ensure_topic
 from helper_pg import execute_retry
 from helper_upsert_source import ensure_kafka_connection
 
@@ -40,6 +41,11 @@ def ensure_none_text_source() -> None:
     drivers don't proliferate connections.
     """
     ensure_kafka_connection()
+    # CREATE SOURCE issues a Kafka metadata fetch that fails fast if the topic
+    # is missing; broker auto-create only fires on a producer write, which
+    # comes later in the driver. Pre-create via admin client so the metadata
+    # fetch succeeds on the first run.
+    ensure_topic(TOPIC_NONE_TEXT)
     execute_retry(
         f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} "
         f"IN CLUSTER {CLUSTER} "
diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py
index 59332b28d64e9..e5c8ac1cc6e6a 100644
--- a/test/antithesis/workload/test/helper_upsert_source.py
+++ b/test/antithesis/workload/test/helper_upsert_source.py
@@ -9,9 +9,11 @@
 
 """Idempotent setup for the Antithesis UPSERT-envelope Kafka source.
 
-Used by all drivers that exercise UPSERT semantics. Topic is auto-created by
-the Kafka broker on first produce; the source/connection are created at most
-once across all drivers (CREATE ... IF NOT EXISTS).
+Used by all drivers that exercise UPSERT semantics. The topic is pre-created
+via the Kafka admin client (broker auto-create only triggers on producer
+write, but CREATE SOURCE does a metadata fetch that fails fast otherwise).
+The source/connection are created at most once across all drivers
+(CREATE ... IF NOT EXISTS).
 """
 
 from __future__ import annotations
@@ -19,6 +21,7 @@
 import logging
 import os
 
+from helper_kafka import ensure_topic
 from helper_pg import execute_retry
 
 LOG = logging.getLogger("antithesis.helper_upsert_source")
@@ -44,6 +47,7 @@ def ensure_upsert_text_source() -> None:
     The resulting source has columns `key TEXT NOT NULL` and `text TEXT`.
     """
     ensure_kafka_connection()
+    ensure_topic(TOPIC_UPSERT_TEXT)
     execute_retry(
         f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} "
         f"IN CLUSTER {CLUSTER} "

From 624149c959c866bd8a0ec49e06dac293047d732a Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Mon, 11 May 2026 22:10:44 -0400
Subject: [PATCH 24/65] test/antithesis: tolerate orphan _progress collision +
 add upsert-v2 first_ selector

---
 .../first_select_upsert_implementation.py     | 61 +++++++++++++++++++
 .../workload/test/helper_none_source.py       |  7 ++-
 test/antithesis/workload/test/helper_pg.py    | 59 ++++++++++++++++++
 .../workload/test/helper_upsert_source.py     |  7 ++-
 4 files changed, 128 insertions(+), 6 deletions(-)
 create mode 100755 test/antithesis/workload/test/first_select_upsert_implementation.py

diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py
new file mode 100755
index 0000000000000..03394a1ebd7f7
--- /dev/null
+++ b/test/antithesis/workload/test/first_select_upsert_implementation.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis test command: pick v1 or v2 of the upsert continual feedback
+operator at the start of each timeline.
+
+The selection is made via `helper_random.random_u64()` (routes through the
+Antithesis SDK for deterministic replay) and applied via `ALTER SYSTEM SET
+enable_upsert_v2 = ...` against the `mz_system` internal port. Because this
+script is a `first_*` Test Composer action it runs after `setup-complete`
+but before any `parallel_driver_*` / `singleton_driver_*` creates a source,
+so every source rendered in this timeline reads the chosen value.
+
+Each branch records a `sometimes` assertion so Antithesis surfaces "v1
+covered" and "v2 covered" as separate dashboard signals — if either ever
+goes 0/N across the run, we've lost that arm of coverage.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+import helper_random
+from helper_pg import execute_internal_retry
+
+from antithesis.assertions import sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("first.select_upsert_implementation")
+
+
+def main() -> int:
+    # Low bit of a SDK-sourced u64 — under Antithesis this routes through the
+    # SDK so timeline replay picks the same arm; outside Antithesis it falls
+    # back to a stdlib-seeded RNG (see helper_random).
+    enable_v2 = (helper_random.random_u64() & 1) == 1
+    LOG.info("rolled enable_upsert_v2=%s for this timeline", enable_v2)
+
+    # Set explicitly in both branches so the chosen value is part of the
+    # timeline's recorded state, not implicit in the bootstrap default.
+    if enable_v2:
+        execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = true")
+        sometimes(True, "upsert continual feedback v2 enabled for timeline", {})
+    else:
+        execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = false")
+        sometimes(True, "upsert continual feedback v1 enabled for timeline", {})
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py
index a3cb3c1704be1..87a90b1ac6087 100644
--- a/test/antithesis/workload/test/helper_none_source.py
+++ b/test/antithesis/workload/test/helper_none_source.py
@@ -22,7 +22,7 @@
 import os
 
 from helper_kafka import ensure_topic
-from helper_pg import execute_retry
+from helper_pg import create_source_idempotent
 from helper_upsert_source import ensure_kafka_connection
 
 LOG = logging.getLogger("antithesis.helper_none_source")
@@ -46,13 +46,14 @@ def ensure_none_text_source() -> None:
     # comes later in the driver. Pre-create via admin client so the metadata
     # fetch succeeds on the first run.
     ensure_topic(TOPIC_NONE_TEXT)
-    execute_retry(
+    create_source_idempotent(
         f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} "
         f"IN CLUSTER {CLUSTER} "
         f"FROM KAFKA CONNECTION antithesis_kafka_conn (TOPIC '{TOPIC_NONE_TEXT}') "
         f"FORMAT TEXT "
         f"INCLUDE PARTITION, OFFSET "
-        f"ENVELOPE NONE"
+        f"ENVELOPE NONE",
+        SOURCE_NONE_TEXT,
     )
     LOG.info(
         "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index d90babf162baf..d336905b1914b 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -31,6 +31,10 @@
 PGUSER = os.environ.get("PGUSER", "materialize")
 PGDATABASE = os.environ.get("PGDATABASE", "materialize")
 
+# Internal pgwire endpoint for system-privileged operations (ALTER SYSTEM SET).
+PGPORT_INTERNAL = int(os.environ.get("PGPORT_INTERNAL", "6877"))
+PGUSER_INTERNAL = os.environ.get("PGUSER_INTERNAL", "mz_system")
+
 # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds
 # keep drivers progressing without masking real correctness signals.
 _CONNECT_TIMEOUT_S = 5
@@ -118,3 +122,58 @@ def query_one_retry(
 ) -> tuple[Any, ...] | None:
     rows = query_retry(sql, params)
     return rows[0] if rows else None
+
+
+def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> None:
+    """Execute a system-privileged statement on the internal port (mz_system).
+
+    Used for ALTER SYSTEM SET and other operations the regular `materialize`
+    role cannot perform. Retries the same transient errors as `execute_retry`.
+    """
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            with psycopg.connect(
+                host=PGHOST,
+                port=PGPORT_INTERNAL,
+                user=PGUSER_INTERNAL,
+                dbname=PGDATABASE,
+                connect_timeout=_CONNECT_TIMEOUT_S,
+                autocommit=True,
+            ) as conn, conn.cursor() as cur:
+                cur.execute(sql, params or ())
+            return
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info("pg internal execute retrying after %s", exc)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+
+
+def create_source_idempotent(create_sql: str, source_name: str) -> None:
+    """Run a CREATE SOURCE statement, tolerating IF-NOT-EXISTS race gaps.
+
+    `CREATE SOURCE IF NOT EXISTS` only short-circuits on the primary source
+    name. When two driver invocations race past the existence check, or when
+    a fault-injected crash mid-DDL leaves an orphan `<name>_progress`
+    subsource in the catalog, the primary create errors with "catalog item
+    ... already exists" despite `IF NOT EXISTS`. Re-check `mz_sources` after
+    such an error; if the source landed concurrently, treat as success.
+    Otherwise re-raise so a true orphan still surfaces.
+    """
+    try:
+        execute_retry(create_sql)
+        return
+    except psycopg.errors.InternalError as exc:
+        if "already exists" not in str(exc):
+            raise
+        rows = query_retry(
+            "SELECT 1 FROM mz_sources WHERE name = %s",
+            (source_name,),
+        )
+        if rows:
+            LOG.info("source %s landed concurrently; tolerating collision", source_name)
+            return
+        raise
diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py
index e5c8ac1cc6e6a..6fac93cdd4f24 100644
--- a/test/antithesis/workload/test/helper_upsert_source.py
+++ b/test/antithesis/workload/test/helper_upsert_source.py
@@ -22,7 +22,7 @@
 import os
 
 from helper_kafka import ensure_topic
-from helper_pg import execute_retry
+from helper_pg import create_source_idempotent, execute_retry
 
 LOG = logging.getLogger("antithesis.helper_upsert_source")
 
@@ -48,11 +48,12 @@ def ensure_upsert_text_source() -> None:
     """
     ensure_kafka_connection()
     ensure_topic(TOPIC_UPSERT_TEXT)
-    execute_retry(
+    create_source_idempotent(
         f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} "
         f"IN CLUSTER {CLUSTER} "
         f"FROM KAFKA CONNECTION {CONNECTION_NAME} (TOPIC '{TOPIC_UPSERT_TEXT}') "
         f"KEY FORMAT TEXT VALUE FORMAT TEXT "
-        f"ENVELOPE UPSERT"
+        f"ENVELOPE UPSERT",
+        SOURCE_UPSERT_TEXT,
     )
     LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT)

From 520f9087d0d742d4a4b1200dc17e3bf593a954c0 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 00:08:38 -0400
Subject: [PATCH 25/65] test/antithesis: add four workload drivers + reclock
 SUT anchor for catalog properties

---
 src/storage/src/source/reclock.rs             |  26 +-
 .../scratchbook/property-catalog.md           |   8 +-
 .../test/anytime_fault_recovery_exercised.py  | 183 +++++++++++++
 ..._kafka_offset_known_not_below_committed.py | 122 +++++++++
 ...nytime_kafka_source_resumes_after_fault.py | 245 ++++++++++++++++++
 .../workload/test/helper_table_mv.py          |  64 +++++
 ...rallel_driver_mv_reflects_table_updates.py | 162 ++++++++++++
 7 files changed, 808 insertions(+), 2 deletions(-)
 create mode 100755 test/antithesis/workload/test/anytime_fault_recovery_exercised.py
 create mode 100755 test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
 create mode 100755 test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
 create mode 100644 test/antithesis/workload/test/helper_table_mv.py
 create mode 100755 test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py

diff --git a/src/storage/src/source/reclock.rs b/src/storage/src/source/reclock.rs
index d4ab5ac4b312b..745115e5dbf72 100644
--- a/src/storage/src/source/reclock.rs
+++ b/src/storage/src/source/reclock.rs
@@ -10,11 +10,13 @@
 /// The `ReclockOperator` observes the progress of a stream that is
 /// timestamped with some source time `FromTime` and generates bindings that describe how the
 /// collection should evolve in target time `IntoTime`.
+use antithesis_sdk::assert_reachable;
 use differential_dataflow::consolidation;
 use differential_dataflow::lattice::Lattice;
 use mz_persist_client::error::UpperMismatch;
 use mz_repr::Diff;
 use mz_storage_client::util::remap_handle::RemapHandle;
+use serde_json::json;
 use timely::order::PartialOrder;
 use timely::progress::Timestamp;
 use timely::progress::frontier::{Antichain, AntichainRef, MutableAntichain};
@@ -128,6 +130,12 @@ where
             upper: self.upper.clone(),
         };
 
+        // Counts how many times append_batch hit an UpperMismatch during this
+        // mint invocation. If it is non-zero and we still exit the while loop
+        // normally, we've exercised the retry path covered by the catalog
+        // property `reclock-mint-eventually-succeeds`.
+        let mut cas_retry_count: u64 = 0;
+
         while *self.upper == [IntoTime::minimum()]
             || (PartialOrder::less_equal(&self.source_upper.frontier(), &new_from_upper)
                 && PartialOrder::less_than(&self.upper, &new_into_upper)
@@ -159,12 +167,28 @@ where
 
             let new_batch = match self.append_batch(updates, &new_into_upper).await {
                 Ok(trace_batch) => trace_batch,
-                Err(UpperMismatch { current, .. }) => self.sync(current.borrow()).await,
+                Err(UpperMismatch { current, .. }) => {
+                    cas_retry_count = cas_retry_count.saturating_add(1);
+                    self.sync(current.borrow()).await
+                }
             };
             batch.updates.extend(new_batch.updates);
             batch.upper = new_batch.upper;
         }
 
+        // Reachability anchor for `reclock-mint-eventually-succeeds`: this
+        // line fires only when a CaS UpperMismatch was observed and the
+        // mint loop nonetheless terminated. That's the path the catalog
+        // wants Antithesis to observe at least once per run; reaching it
+        // is the signal, so the marker is unconditional `assert_reachable!`
+        // rather than `assert_sometimes!(true, …)`.
+        if cas_retry_count > 0 {
+            assert_reachable!(
+                "reclock: mint completed after at least one compare_and_append UpperMismatch",
+                &json!({"cas_retry_count": cas_retry_count})
+            );
+        }
+
         batch
     }
 
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 9e94cdf8ed089..8f3e2a2563d74 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -1,6 +1,6 @@
 ---
 commit: 007c7af9d9970fb2030c7212368b232e0fbc363e
-updated: 2026-05-11
+updated: 2026-05-12
 ---
 
 # Property Catalog: Materialize
@@ -189,6 +189,7 @@ Properties that verify the system reaches interesting states under fault injecti
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P0 — most fundamental operational property; prerequisite for all others |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_fault_recovery_exercised.py`. Anytime driver probes `SELECT 1` with a short connect timeout (bypassing helper_pg's retry budget so the fault-active window is observable) and records `sometimes("...succeeded after a previously-observed connect failure", …)` for the recovery transition, plus corroborating `sometimes` anchors for "observed replica non-online" and "at least one probe succeeded this invocation". |
 | **Property** | After the coordinator (environmentd) crashes and restarts, the system eventually becomes healthy (readiness endpoint returns 200) and can serve SQL queries. |
 | **Invariant** | `Sometimes(healthy_after_crash)`: the system must reach a state where it can serve queries after a crash. This confirms recovery works end-to-end, not just in unit tests. |
 | **Antithesis Angle** | Kill environmentd at various points during operation. Verify it restarts, reconnects to persist, recovers catalog, and serves queries. Antithesis explores crash timing — during DDL, during peek, during group_commit. |
@@ -211,6 +212,7 @@ Properties that verify the system reaches interesting states under fault injecti
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P1 — end-to-end user-visible correctness; Materialize's core value |
+| **Status** | **Implemented (workload-side, table-backed)** — `test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py` + `helper_table_mv.py`. Each invocation inserts N rows tagged with a per-invocation prefix into `mv_input_table`, polls the rolling-count MV `mv_input_count` after a quiet period, and pairs `sometimes("mv: row_count caught up …", …)` (liveness anchor) with `always("mv: row_count equals inserted count …", …)` (safety on the settled count). Kafka-source-backed MV is covered indirectly by the Kafka-source drivers — direct MV-on-Kafka-source coverage is deferred. |
 | **Property** | After data is written to a source, materialized views that depend on that source eventually reflect the new data. |
 | **Invariant** | `Sometimes(mv_contains_new_data)`: after inserting data into a table or producing to a Kafka source, a SELECT on a dependent materialized view must eventually return the new data. |
 | **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. |
@@ -262,6 +264,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P1 — operational expectation; broker faults are a routine condition |
+| **Status** | **Implemented (workload-side, shared driver)** — `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py`. Continuous polling state machine per Kafka source: `OBSERVING` -> `STALLED` after N consecutive identical `offset_committed` samples, then `Reachable("...resumed advancing after a sustained stall", …)` on the first strictly-greater sample. The driver tags each recovery with `saw_kafka_metadata_failure` (broker-fault signal) and `saw_replica_non_online` (clusterd-restart signal) so triage can distinguish the two fault classes. |
 | **Property** | After a transient network partition or Kafka broker outage that prevents the source from making progress, once connectivity is restored, the source eventually ingests all messages that were produced during the outage. |
 | **Invariant** | `Sometimes(source_resumes_after_broker_fault)`: at least once per run, after injecting a network fault between materialized and Kafka and then calling `ANTITHESIS_STOP_FAULTS`, the workload observes the source's `COUNT(*)` advance past its pre-fault value. |
 | **Antithesis Angle** | Network partition between the `materialized` container and the Kafka container; persist+metadata stay reachable. Tests rdkafka reconnect, snapshot statistics restoration (commit 0a34b6c79d), and that no permanent stall mode is entered. |
@@ -273,6 +276,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P1 — recovery from clusterd kill is the most common operational fault path |
+| **Status** | **Implemented (workload-side, shared driver)** — same `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py` as `kafka-source-survives-broker-fault`. The stall-then-advance transition is fault-kind-agnostic; `saw_replica_non_online` corroborates that the source recovered specifically from a clusterd kill. Combines with the existing `kafka-source-no-data-duplication` and `kafka-source-no-data-loss` assertions to also rule out double-counting and gaps on the rehydrated path. Requires node-termination faults to be enabled in the Antithesis tenant. |
 | **Property** | After clusterd (storage worker) is killed and restarted, the Kafka source recovers, replays the right resume offsets, and ingests messages produced before, during, and after the restart. |
 | **Invariant** | `Sometimes(source_recovered_after_clusterd_restart)`: after a kill+restart, eventually `COUNT(*) FROM source >= produced_count`. Combined with `kafka-source-no-data-duplication` to also rule out double-counting. |
 | **Antithesis Angle** | Direct test of the `storage-command-replay-idempotent` mechanism end-to-end through Kafka. Antithesis explores crash timing across the reclock mint, persist-sink append, and upsert snapshot-completion windows. Requires node-termination faults to be enabled. |
@@ -390,6 +394,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Liveness |
 | **Priority** | P2 — pre-existing concern under persist instability |
+| **Status** | **Implemented (SUT-side anchor)** — `src/storage/src/source/reclock.rs`: `ReclockOperator::mint` carries a local `cas_retry_count` and fires `assert_reachable!("reclock: mint completed after at least one compare_and_append UpperMismatch", …)` after the while-loop terminates when at least one `UpperMismatch` was observed. The reachability anchor covers the "retry path was exercised AND mint terminated" half of the property. The workload-side "source frontier advanced past the contention point" liveness check is approximated by the existing `anytime_kafka_frontier_monotonic.py` + `anytime_kafka_source_resumes_after_fault.py` drivers and is not duplicated here. |
 | **Property** | Under transient persist outages or competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, reclock.rs:160-166) eventually completes for every source-frontier advance that has data to bind. |
 | **Invariant** | `Sometimes(mint_completed_after_cas_retry)`: at least once per run, Antithesis observes a reclock mint that took >1 CaS attempt and then completed (i.e. a successful retry path was exercised). Critically, the workload should also observe that the source frontier eventually advances past the value of `source_upper` captured at the time of the contention — i.e. the loop is not livelocked. |
 | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. |
@@ -401,6 +406,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P2 — observable statistics correctness; regression target for commit 3e32df1f69 |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py`. Continuous polling driver queries every Kafka source's `mz_source_statistics_per_worker` row and fires `always("kafka: source offset_known < offset_committed", …)` whenever a single per-worker row has `offset_known < offset_committed`. Both fields are read from the same row of the same query so the comparison cannot cross a metric-update boundary. The SUT-side mirror in `src/storage/src/statistics.rs` is deferred. |
 | **Property** | For every Kafka source, the source-statistics view always reports `offset_known >= offset_committed`. The metric `offset_known` reflects what the broker has told us is available; `offset_committed` reflects what Materialize has durably ingested. Causally, `offset_known` cannot lag `offset_committed`. |
 | **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. |
 | **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. Direct regression target for commit 3e32df1f69. |
diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
new file mode 100755
index 0000000000000..143dd8c103dce
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `fault-recovery-exercised`.
+
+The most fundamental liveness property in the catalog: after the system
+takes a hit from Antithesis fault injection, it must eventually come back
+and serve SQL again. The catalog frames this in terms of the `/health/ready`
+endpoint returning 200; this workload uses `SELECT 1` (the cheapest
+end-to-end pgwire round trip) as the proxy, and observes the cluster
+replica status as a corroborating signal.
+
+Approach:
+  - Probe `materialized` with a *short-budget* psycopg connect on every
+    tick. Long retry budgets in `helper_pg` would mask the actual
+    fault-active periods we want to detect — here we want to observe the
+    transitions.
+  - Track per-tick state: was this probe a success or a connect/query
+    failure?
+  - If we observe a failure at tick T and a success at tick T+k (any k>=1)
+    within this invocation, that is the recovery transition we care about, and
+    `sometimes("...succeeded after a previously-observed connect failure")` holds.
+
+  - Separately, fire `sometimes("...observed cluster replica non-online")`
+    when `mz_cluster_replica_statuses` reports any antithesis replica
+    not `online`. This is a corroborating signal so triage can distinguish
+    "no fault ever landed" from "faults landed but no recovery observed."
+
+This is an `anytime_` driver — Antithesis launches it many times, each
+short-lived. Recovery transitions accumulate across invocations.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import time
+
+import psycopg
+from helper_pg import (
+    PGDATABASE,
+    PGHOST,
+    PGPORT,
+    PGUSER,
+    query_one_retry,
+)
+
+from antithesis.assertions import sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.fault_recovery_exercised")
+
+POLL_INTERVAL_S = 0.5
+RUN_BUDGET_S = 30.0
+PROBE_CONNECT_TIMEOUT_S = 2.0
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+
+
+def _probe_select_one() -> bool:
+    """Run `SELECT 1` with a short connect timeout. Return True on success.
+
+    Distinct from the resilient `helper_pg.query_*` paths because we *want*
+    to observe transient failures here — they are the fault-active half of
+    the recovery transition we are looking for.
+    """
+    try:
+        with psycopg.connect(
+            host=PGHOST,
+            port=PGPORT,
+            user=PGUSER,
+            dbname=PGDATABASE,
+            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+            autocommit=True,
+        ) as conn, conn.cursor() as cur:
+            cur.execute("SELECT 1")
+            row = cur.fetchone()
+            return row is not None and row[0] == 1
+    except Exception:  # noqa: BLE001
+        return False
+
+
+def _replica_non_online() -> bool:
+    """Best-effort: is any antithesis-cluster replica reporting non-online?
+
+    Uses the retry-budgeted query helper because we want a clear yes/no, not
+    a probe outcome — if the helper can't get an answer we conservatively
+    return False so the corroborating signal stays silent rather than
+    accidentally firing on a probe-side failure.
+    """
+    try:
+        row = query_one_retry(
+            """
+            SELECT EXISTS (
+                SELECT 1
+                FROM mz_internal.mz_cluster_replica_statuses s
+                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                JOIN mz_clusters c ON c.id = r.cluster_id
+                WHERE c.name = %s AND s.status != 'online'
+            )
+            """,
+            (ANTITHESIS_CLUSTER,),
+        )
+    except Exception:  # noqa: BLE001
+        return False
+    return bool(row and row[0])
+
+
+def main() -> int:
+    deadline = time.monotonic() + RUN_BUDGET_S
+
+    # Per-invocation state. The driver is short-lived; Antithesis covers the
+    # full timeline by launching many invocations.
+    saw_failure = False
+    saw_recovery_after_failure = False
+    saw_replica_non_online = False
+    successes = 0
+    failures = 0
+
+    while time.monotonic() < deadline:
+        ok = _probe_select_one()
+        if ok:
+            successes += 1
+            if saw_failure:
+                saw_recovery_after_failure = True
+        else:
+            failures += 1
+            saw_failure = True
+
+        if _replica_non_online():
+            saw_replica_non_online = True
+
+        time.sleep(POLL_INTERVAL_S)
+
+    sometimes(
+        saw_recovery_after_failure,
+        "fault recovery: SELECT 1 succeeded after a previously-observed connect failure",
+        {
+            "successes": successes,
+            "failures": failures,
+            "saw_replica_non_online": saw_replica_non_online,
+        },
+    )
+    sometimes(
+        saw_replica_non_online,
+        "fault recovery: observed antithesis_cluster replica non-online at least once",
+        {"successes": successes, "failures": failures},
+    )
+    # Bare-minimum healthy-coverage signal: at least one successful probe in
+    # the invocation. If this ever goes 0/N across a run, no driver was
+    # ever able to talk to Materialize and the entire test is suspect —
+    # downstream property assertions would be vacuous.
+    sometimes(
+        successes > 0,
+        "fault recovery: at least one SELECT 1 succeeded this invocation",
+        {"successes": successes, "failures": failures},
+    )
+
+    LOG.info(
+        "fault-recovery probe done; successes=%d failures=%d recovery=%s replica_offline=%s",
+        successes,
+        failures,
+        saw_recovery_after_failure,
+        saw_replica_non_online,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    # Only `os` is otherwise unused in this module; the PG* names are already
+    # consumed by `_probe_select_one`. This tuple just keeps linters quiet.
+    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os)
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
new file mode 100755
index 0000000000000..9801c4dfa65b7
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `offset-known-not-below-committed`.
+
+For every Kafka source, `mz_internal.mz_source_statistics_per_worker` must
+always report `offset_known >= offset_committed`. `offset_known` reflects
+what the broker has told us is available; `offset_committed` reflects what
+Materialize has durably ingested. Causally, the broker's idea of "this
+offset exists" cannot lag behind what we have already durably read. Direct
+regression target for commit 3e32df1f69, which clamped the metric to
+prevent this flip on the first sample after a clusterd restart.
+
+This is an `anytime_` driver — it runs continuously throughout the timeline
+under active fault injection. The interesting timing per the catalog is the
+very first sample after a clusterd restart, where `offset_known` is
+restored from the broker watermark while `offset_committed` is restored
+from persist; we want Antithesis to drop a poll into that window.
+
+Both fields are read in the same row of the same SELECT so the comparison
+never crosses a metric-update boundary. The per-worker view is queried
+(not the rolled-up `mz_source_statistics`) because the invariant must hold
+per worker — averaging would mask a single worker that crossed the line.
+
+Errors during polling (clusterd down, network partitioned) are *expected*
+under fault injection and must not produce false-positive failures; we
+just skip the sample.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+from helper_pg import query_retry
+
+from antithesis.assertions import always
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.kafka_offset_known_not_below_committed")
+
+POLL_INTERVAL_S = 0.5
+RUN_BUDGET_S = 30.0
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+
+
+def _samples() -> list[tuple[str, int, int, int]]:
+    """Fetch (source_name, worker_id, offset_known, offset_committed) rows.
+
+    Joins the per-worker statistics to `mz_sources` and `mz_clusters` so
+    details carry a source name and only Kafka sources on the antithesis
+    cluster are sampled. Rows with a NULL offset are filtered out in SQL.
+    """
+    stats_rows = query_retry(
+        """
+        SELECT
+            s.name,
+            ss.worker_id::bigint,
+            ss.offset_known::bigint,
+            ss.offset_committed::bigint
+        FROM mz_internal.mz_source_statistics_per_worker ss
+        JOIN mz_sources s ON s.id = ss.id
+        JOIN mz_clusters c ON c.id = s.cluster_id
+        WHERE c.name = %s
+          AND s.type = 'kafka'
+          AND ss.offset_known IS NOT NULL
+          AND ss.offset_committed IS NOT NULL
+        """,
+        (ANTITHESIS_CLUSTER,),
+    )
+    # Coerce whatever the driver returned into plain str/int tuples.
+    return [
+        (str(name), int(worker), int(known), int(committed))
+        for name, worker, known, committed in stats_rows
+    ]
+
+
+def main() -> int:
+    """Poll per-worker Kafka offsets for RUN_BUDGET_S and assert on each row.
+
+    A failed stats query is logged and the sample skipped; returns 0.
+    """
+    stop_at = time.monotonic() + RUN_BUDGET_S
+    checked = 0
+    while time.monotonic() < stop_at:
+        try:
+            batch = _samples()
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("source stats query failed: %s; sleeping and retrying", exc)
+            batch = []
+        for name, worker_id, offset_known, offset_committed in batch:
+            details = {
+                "source": name,
+                "worker_id": worker_id,
+                "offset_known": offset_known,
+                "offset_committed": offset_committed,
+                "deficit": offset_committed - offset_known,
+            }
+            always(
+                offset_known >= offset_committed,
+                "kafka: source offset_known < offset_committed",
+                details,
+            )
+            checked += 1
+        time.sleep(POLL_INTERVAL_S)
+    LOG.info("offset_known-not-below-committed check done; %d samples", checked)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
new file mode 100755
index 0000000000000..85042a317d7cb
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `kafka-source-survives-broker-fault` and
+`kafka-source-survives-clusterd-restart` (combined liveness signal).
+
+Both catalog properties amount to: after a transient fault that prevents
+the source from making progress, once the fault is over the source must
+ingest the messages it was unable to read during the outage. Externally
+this looks identical for either fault kind — `offset_committed` stalls
+during the outage and resumes advancing afterward — so one anytime driver
+records the stall-then-advance transition and we tag the corroborating
+fault signal (kafka broker reachable / replica online) in `details` so
+triage can distinguish the two cases on a hit.
+
+Per-invocation state machine, per source:
+  - `IDLE` (initial). On a successful sample, store the offset and move
+    to `OBSERVING`.
+  - `OBSERVING`. If the sample equals the stored value for STALL_TICKS
+    consecutive ticks, move to `STALLED` (the source has stopped
+    progressing — most likely fault-induced). Otherwise, refresh the
+    stored value.
+  - `STALLED`. On any sample strictly greater than the stalled value, fire
+    the `reachable(...)` recovery anchor (at most once per source) and
+    return to `OBSERVING` with the new value. Otherwise stay stalled.
+
+Failed samples (clusterd unavailable, network partition) do not transition
+the state machine — they are the fault-active condition we want to bridge
+over. They are counted only so the `details` payload can corroborate the
+recovery transition.
+
+The driver also records two corroborating `sometimes(...)` signals so
+triage can confirm Antithesis actually hit each of the two fault classes
+this property cluster cares about:
+  - replica went non-online (clusterd-restart signal)
+  - direct Kafka admin metadata fetch failed (broker-fault signal)
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import time
+
+from helper_pg import query_one_retry, query_retry
+
+from antithesis.assertions import reachable, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.kafka_source_resumes_after_fault")
+
+POLL_INTERVAL_S = 1.0
+RUN_BUDGET_S = 45.0
+# Number of consecutive identical samples after which we consider the source
+# "stalled" rather than just briefly idle. Five seconds (5 ticks * 1s)
+# comfortably exceeds the natural quiet-period between produces but is well
+# below the fault-injection windows Antithesis schedules.
+STALL_TICKS = 5
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092")
+
+
+def _kafka_sources() -> list[str]:
+    """Return the names of the Kafka sources on the antithesis cluster."""
+    sql = """
+        SELECT s.name
+        FROM mz_sources s
+        JOIN mz_clusters c ON c.id = s.cluster_id
+        WHERE c.name = %s AND s.type = 'kafka'
+        """
+    found = query_retry(sql, (ANTITHESIS_CLUSTER,))
+    # Each row holds a single column; unwrap it to the bare name.
+    return [record[0] for record in found]
+
+
+def _offset_committed(source_name: str) -> int | None:
+    """Largest `offset_committed` reported for `source_name`, or None if unset."""
+    sql = """
+        SELECT MAX(ss.offset_committed)::bigint
+        FROM mz_internal.mz_source_statistics ss
+        JOIN mz_sources s ON s.id = ss.id
+        WHERE s.name = %s
+        """
+    result = query_one_retry(sql, (source_name,))
+    # A missing row and a NULL aggregate are both passed back as None so the
+    # caller can skip the sample.
+    if result is not None and result[0] is not None:
+        return int(result[0])
+    return None
+
+
+def _replica_non_online() -> bool:
+    """Report whether any antithesis_cluster replica has a non-'online' status."""
+    sql = """
+            SELECT EXISTS (
+                SELECT 1
+                FROM mz_internal.mz_cluster_replica_statuses s
+                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                JOIN mz_clusters c ON c.id = r.cluster_id
+                WHERE c.name = %s AND s.status != 'online'
+            )
+            """
+    try:
+        found = query_one_retry(sql, (ANTITHESIS_CLUSTER,))
+    except Exception:  # noqa: BLE001
+        # Best-effort signal: a failed status query counts as "not observed".
+        return False
+    return bool(found and found[0])
+
+
+def _kafka_metadata_failed() -> bool:
+    """Probe the broker directly and report whether the metadata fetch failed.
+
+    Returns True only when `list_topics` against KAFKA_BROKER raises. If the
+    `confluent_kafka` admin client cannot be imported the probe is skipped
+    and False is returned, so the polling loop never sees an exception.
+
+    NOTE(review): the probe runs from the workload process, so a failure
+    reflects workload <-> broker reachability; treat it as corroboration
+    for a materialized <-> kafka partition rather than proof of one.
+    """
+    try:
+        from confluent_kafka.admin import AdminClient
+    except Exception:  # noqa: BLE001
+        return False
+    admin_config = {"bootstrap.servers": KAFKA_BROKER}
+    try:
+        AdminClient(admin_config).list_topics(timeout=2)
+    except Exception:  # noqa: BLE001
+        return True
+    return False
+
+
+def main() -> int:
+    """Track stall-then-advance per Kafka source; a first sample only seeds state."""
+    deadline = time.monotonic() + RUN_BUDGET_S
+
+    # Per-source state machine.
+    #   state: "OBSERVING" or "STALLED"
+    #   last_value: most recent committed offset observed
+    #   stall_streak: consecutive ticks at last_value
+    states: dict[str, dict] = {}
+
+    # Cross-source corroborating signals collected throughout this run.
+    saw_replica_non_online = False
+    saw_kafka_metadata_failure = False
+    # Per-source: did we observe stall->advance at least once.
+    resumed_after_stall: dict[str, bool] = {}
+
+    while time.monotonic() < deadline:
+        if _replica_non_online():
+            saw_replica_non_online = True
+        if _kafka_metadata_failed():
+            saw_kafka_metadata_failure = True
+
+        try:
+            sources = _kafka_sources()
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("source list query failed: %s; sleeping", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        for source in sources:
+            try:
+                observed = _offset_committed(source)
+            except Exception as exc:  # noqa: BLE001
+                LOG.info("offset_committed query failed for %s: %s", source, exc)
+                continue
+            if observed is None:
+                continue
+
+            st = states.get(source)
+            if st is None:
+                # First sample: seed only; self-comparison is not a stall tick.
+                states[source] = {
+                    "state": "OBSERVING",
+                    "last_value": observed,
+                    "stall_streak": 0,
+                }
+                continue
+            if st["state"] == "OBSERVING":
+                if observed == st["last_value"]:
+                    st["stall_streak"] += 1
+                    if st["stall_streak"] >= STALL_TICKS:
+                        st["state"] = "STALLED"
+                else:
+                    # Progress: reset.
+                    st["last_value"] = observed
+                    st["stall_streak"] = 0
+            elif observed > st["last_value"]:
+                # STALLED and the offset advanced: recovery. Signal once per
+                # source per invocation via `reachable(...)`; the state is
+                # reset below either way so later stalls are still tracked.
+                if not resumed_after_stall.get(source, False):
+                    resumed_after_stall[source] = True
+                    reachable(
+                        "kafka source: offset_committed resumed advancing after a sustained stall",
+                        {
+                            "source": source,
+                            "stalled_at": st["last_value"],
+                            "observed_after_recovery": observed,
+                            "stall_ticks_required": STALL_TICKS,
+                            "saw_replica_non_online": saw_replica_non_online,
+                            "saw_kafka_metadata_failure": saw_kafka_metadata_failure,
+                        },
+                    )
+                st["state"] = "OBSERVING"
+                st["last_value"] = observed
+                st["stall_streak"] = 0
+
+        time.sleep(POLL_INTERVAL_S)
+
+    sometimes(
+        saw_replica_non_online,
+        "kafka source resumes: observed antithesis_cluster replica non-online",
+        {"resumed_sources": sorted(resumed_after_stall.keys())},
+    )
+    sometimes(
+        saw_kafka_metadata_failure,
+        "kafka source resumes: observed direct Kafka metadata fetch failure",
+        {"resumed_sources": sorted(resumed_after_stall.keys())},
+    )
+
+    LOG.info(
+        "kafka-source-resumes-after-fault done; sources_resumed=%d replica_offline=%s metadata_failed=%s",
+        sum(1 for v in resumed_after_stall.values() if v),
+        saw_replica_non_online,
+        saw_kafka_metadata_failure,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_table_mv.py b/test/antithesis/workload/test/helper_table_mv.py
new file mode 100644
index 0000000000000..e865f3f2f5e89
--- /dev/null
+++ b/test/antithesis/workload/test/helper_table_mv.py
@@ -0,0 +1,64 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis table + materialized view scaffolding.
+
+Used by the `mv-reflects-source-updates` driver. The table holds rows with a
+per-invocation `prefix` so concurrent driver instances scope to disjoint
+groups, and the materialized view rolls those rows up by prefix:
+
+    CREATE TABLE mv_input_table (id BIGINT NOT NULL, prefix TEXT NOT NULL);
+    CREATE MATERIALIZED VIEW mv_input_count AS
+        SELECT prefix, COUNT(*)::BIGINT AS row_count
+        FROM mv_input_table
+        GROUP BY prefix;
+
+Defining the MV on the local coordinator's table (rather than a Kafka
+source) deliberately tests the end-to-end path independent of source
+ingestion: dataflow rendering, persist write of the MV output, and
+frontier advancement through compute. Source-side faults are still
+exercised because the workload runs under the same fault-injection regime
+as everything else.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from helper_pg import execute_retry
+
+LOG = logging.getLogger("antithesis.helper_table_mv")
+
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+
+TABLE_MV_INPUT = "mv_input_table"
+MV_NAME = "mv_input_count"
+
+
+def ensure_table_and_mv() -> None:
+    """Issue the idempotent DDL for the input table and its count MV.
+
+    Both statements carry IF NOT EXISTS; the table is created first because
+    the view, placed IN CLUSTER `CLUSTER`, selects from it.
+    """
+    create_table = (
+        f"CREATE TABLE IF NOT EXISTS {TABLE_MV_INPUT} "
+        f"(id BIGINT NOT NULL, prefix TEXT NOT NULL)"
+    )
+    create_mv = (
+        f"CREATE MATERIALIZED VIEW IF NOT EXISTS {MV_NAME} "
+        f"IN CLUSTER {CLUSTER} AS "
+        f"SELECT prefix, COUNT(*)::BIGINT AS row_count "
+        f"FROM {TABLE_MV_INPUT} "
+        f"GROUP BY prefix"
+    )
+    for statement in (create_table, create_mv):
+        execute_retry(statement)
+    LOG.info("table %s and MV %s ready", TABLE_MV_INPUT, MV_NAME)
diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
new file mode 100755
index 0000000000000..c026be09ea522
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `mv-reflects-source-updates`.
+
+End-to-end user-visible property: after data is written to an upstream
+collection, materialized views that depend on that collection eventually
+reflect the new data. Materialize's headline value proposition.
+
+This driver uses a TABLE (not a Kafka source) so the property is exercised
+independent of source ingestion: the test path is INSERT -> coordinator
+group_commit -> persist write of the table -> MV's compute dataflow ->
+persist write of the MV output -> SELECT. Kafka-source-specific liveness
+is covered by the other Kafka-source drivers.
+
+Each invocation:
+  1. Ensures `mv_input_table` + materialized view `mv_input_count` exist.
+  2. Picks a per-invocation prefix so concurrent driver instances scope to
+     disjoint MV rows.
+  3. INSERTs N rows tagged with the prefix.
+  4. Requests an Antithesis quiet period and polls the MV until the count
+     for the prefix equals N.
+  5. Asserts:
+       - `always(...)` the MV count matches what was inserted (no over- or
+         under-counting after settle).
+       - `sometimes(...)` the catchup completed within the budget (the
+         liveness anchor — without this, the always check could be vacuous
+         on a slow-catchup invocation).
+
+This is a `parallel_driver_` — many concurrent instances exercise the MV
+without colliding because each invocation owns its prefix range.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_random
+from helper_pg import execute_retry, query_one_retry
+from helper_quiet import request_quiet_period
+from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.mv_reflects_table_updates")
+
+INSERTS_PER_INVOCATION = 40
+QUIET_PERIOD_S = 20
+CATCHUP_TIMEOUT_S = 60.0
+CATCHUP_POLL_INTERVAL_S = 0.5
+
+
+def _mv_count_for_prefix(prefix: str) -> int | None:
+    """Look up the row_count that the MV currently reports for `prefix`.
+
+    Returns:
+        The count as an int, or None when the query yields no row for the
+        prefix. None is distinct from 0: an absent group is simply missing
+        from the MV rather than reported with a zero count.
+    """
+    lookup_sql = f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s"
+    hit = query_one_retry(lookup_sql, (prefix,))
+    # A None result is passed through as "no row for this prefix".
+    if hit is not None:
+        return int(hit[0])
+    return None
+
+
+def main() -> int:
+    """Insert one prefixed batch, wait for the MV to catch up, then re-check it.
+
+    The safety assertion is evaluated on a fresh read taken after catchup,
+    so a count that drifts from N after settling is reported. Returns 0.
+    """
+    ensure_table_and_mv()
+
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("mv driver starting; prefix=%s", prefix)
+
+    # Insert N rows tagged with the prefix. We batch into a single statement
+    # so the coordinator processes them as one group_commit, which keeps the
+    # workload-visible target offset for catchup well-defined (otherwise a
+    # mid-insert crash would split the row count and the MV would catch up
+    # to "some" count rather than exactly N).
+    placeholders = ", ".join(["(%s, %s)"] * INSERTS_PER_INVOCATION)
+    params: list[object] = []
+    for i in range(INSERTS_PER_INVOCATION):
+        params.extend([i, prefix])
+    execute_retry(
+        f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES {placeholders}",
+        params,
+    )
+
+    request_quiet_period(QUIET_PERIOD_S)
+
+    # Poll the MV until the row_count for this prefix reaches N. The MV's
+    # `COUNT(*) GROUP BY prefix` shape means the row for this prefix may
+    # appear partially populated during the catchup window.
+    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
+    observed = _mv_count_for_prefix(prefix)
+    while observed != INSERTS_PER_INVOCATION and time.monotonic() < deadline:
+        time.sleep(CATCHUP_POLL_INTERVAL_S)
+        observed = _mv_count_for_prefix(prefix)
+
+    caught_up = observed == INSERTS_PER_INVOCATION
+
+    details = {
+        "mv": MV_NAME,
+        "table": TABLE_MV_INPUT,
+        "prefix": prefix,
+        "expected": INSERTS_PER_INVOCATION,
+        "observed": observed,
+    }
+    sometimes(
+        caught_up,
+        "mv: row_count caught up to inserted count after quiet period",
+        details,
+    )
+
+    if not caught_up:
+        LOG.info(
+            "catchup did not complete in budget; skipping safety assertion "
+            "(observed=%s expected=%d)",
+            observed,
+            INSERTS_PER_INVOCATION,
+        )
+        return 0
+
+    # Safety check: re-read the MV now that it has settled. The polled
+    # value already equals N here, so asserting on it would be vacuous; the
+    # fresh read must still be exactly N. Higher is double-counting, lower
+    # is a regressed read — both are correctness bugs worth surfacing.
+    confirmed = _mv_count_for_prefix(prefix)
+    always(
+        confirmed == INSERTS_PER_INVOCATION,
+        "mv: row_count equals inserted count for prefix after settle",
+        {**details, "confirmed": confirmed},
+    )
+
+    LOG.info(
+        "mv driver done; inserted=%d mv_count=%s prefix=%s",
+        INSERTS_PER_INVOCATION,
+        confirmed,
+        prefix,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 7c026caeb4de1cf4c70602e7f4f5f0a8d3029db0 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 00:25:31 -0400
Subject: [PATCH 26/65] test/antithesis: persist-cas-monotonicity SUT anchor +
 strict-serializable-reads workload driver

---
 Cargo.lock                                    |   1 +
 src/persist-client/Cargo.toml                 |   1 +
 src/persist-client/src/internal/apply.rs      |  18 ++
 .../scratchbook/property-catalog.md           |   2 +
 ...rallel_driver_strict_serializable_reads.py | 225 ++++++++++++++++++
 5 files changed, 247 insertions(+)
 create mode 100755 test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py

diff --git a/Cargo.lock b/Cargo.lock
index 2f4eed40b37c2..78cfc5d7ecd59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7203,6 +7203,7 @@ dependencies = [
 name = "mz-persist-client"
 version = "26.25.0-dev.0"
 dependencies = [
+ "antithesis_sdk",
  "anyhow",
  "arrayvec 0.7.6",
  "arrow",
diff --git a/src/persist-client/Cargo.toml b/src/persist-client/Cargo.toml
index 0fad73a172d71..0d2b068964372 100644
--- a/src/persist-client/Cargo.toml
+++ b/src/persist-client/Cargo.toml
@@ -28,6 +28,7 @@ name = "benches"
 harness = false
 
 [dependencies]
+antithesis_sdk.workspace = true
 anyhow.workspace = true
 arrayvec.workspace = true
 arrow.workspace = true
diff --git a/src/persist-client/src/internal/apply.rs b/src/persist-client/src/internal/apply.rs
index a48982ff77eb9..5085b24b3d6fb 100644
--- a/src/persist-client/src/internal/apply.rs
+++ b/src/persist-client/src/internal/apply.rs
@@ -15,6 +15,9 @@ use std::ops::ControlFlow::{self, Break, Continue};
 use std::sync::Arc;
 use std::time::Instant;
 
+use antithesis_sdk::assert_always_greater_than;
+use serde_json::json;
+
 use crate::cache::{LockingTypedState, StateCache};
 use crate::error::{CodecMismatch, InvalidUsage};
 use crate::internal::gc::GcReq;
@@ -598,6 +601,21 @@ where
             }
         }
 
+        // Antithesis-reportable form of the broader `persist-cas-monotonicity`
+        // catalog property: SeqNo must strictly increase across any committed
+        // state transition. The narrower equality check below (next == seqno)
+        // still panics on violation and stays in place to catch skip/regress
+        // in the same call.
+        assert_always_greater_than!(
+            new_state.seqno().0,
+            expected.0,
+            "persist: state seqno did not strictly increase across CaS apply",
+            &json!({
+                "expected_prev": expected.0,
+                "computed_next": new_state.seqno().0,
+                "cmd": cmd.name,
+            })
+        );
         assert_eq!(
             expected.next(),
             new_state.seqno(),
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 8f3e2a2563d74..746796228d668 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -26,6 +26,7 @@ Properties that verify data correctness when crashes, network partitions, and co
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — backbone of persist consistency; all other persist properties depend on this |
+| **Status** | **Implemented (SUT-side)** — `src/persist-client/src/internal/apply.rs`: alongside the existing `assert_eq!(expected.next(), new_state.seqno(), …)` strict-increment check in `compute_next_state_locked`, an `assert_always_greater_than!(new_state.seqno().0, expected.0, "persist: state seqno did not strictly increase across CaS apply", …)` makes the broader monotonicity invariant a reportable Antithesis property rather than only a process panic. The strict-equality `assert_eq!` is retained so the narrower invariant (next == seqno) still surfaces. The companion rollup-seqno invariant (`state.rs:1324` doc comment) is deferred. |
 | **Property** | Persist shard state versions (SeqNo) form a strictly increasing sequence. No writer can observe or apply a lower SeqNo after observing a higher one. |
 | **Invariant** | `Always`: for any shard, if SeqNo N is observed, no subsequent observation returns SeqNo < N. Rollups maintain seqno <= seqno_since. This must hold on every check — a single violation means state corruption. |
 | **Antithesis Angle** | Partition storage from persist backend mid-write. One writer races to increment SeqNo while another caches an old value and retries. Crash during GC/rollup operations. Antithesis explores interleaving of concurrent CaS loops. |
@@ -74,6 +75,7 @@ Properties that verify Materialize's strict serializability guarantee and timest
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — Materialize's core advertised guarantee; user-visible |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py`. Inserts one row per step into `mv_input_table` and, between steps, opens a *fresh* psycopg connection (explicit `SET transaction_isolation TO 'strict serializable'`) to SELECT the rolling-count MV's row for the invocation's prefix. After a quiet-period closing observation, asserts (a) `always("…fresh-connection read regressed across adjacent observations", …)` for every adjacent pair, and (b) `always("…closing fresh-connection read regressed below earlier maximum", …)` for the closing read versus the historical max. One `sometimes("…final fresh-connection read reached inserted count", …)` liveness anchor. The SUT-side oracle-timestamp-non-decreasing mirror in `src/adapter/src/coord/in_memory_oracle.rs` is deferred. |
 | **Property** | Two reads on the same collection at timestamps t1 < t2 (assigned by the oracle) must observe consistent ordering: if t1 sees state S, t2 cannot observe a state prior to S. |
 | **Invariant** | `Always`: for any two reads where oracle assigns t1 < t2, the result at t2 must include all changes visible at t1. The oracle read timestamp must advance monotonically. |
 | **Antithesis Angle** | Run parallel transactions in StrictSerializable mode. One writes, another reads concurrently. Inject delays in oracle timestamp advancement. Antithesis explores whether reads can bypass the linearization point. |
diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
new file mode 100755
index 0000000000000..c4af73b434635
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `strict-serializable-reads`.
+
+Materialize's headline consistency guarantee: two reads on the same
+collection at oracle-assigned timestamps t1 < t2 must observe consistent
+ordering — anything visible at t1 must remain visible at t2. This driver
+exercises the cross-read half of that property: a sequence of fresh-
+connection reads against a materialized view, interleaved with writes,
+must yield a non-decreasing count.
+
+Approach:
+  1. Reuse `helper_table_mv` (table `mv_input_table` + MV `mv_input_count`)
+     so this driver does not introduce new schema. Each invocation owns a
+     fresh prefix so concurrent driver instances scope to disjoint rows.
+  2. For each step k = 1..N:
+       - INSERT one row tagged with the prefix in autocommit mode (each
+         insert is its own oracle-timestamped write).
+       - Open a *fresh* psycopg connection, set `transaction_isolation`
+         to `strict serializable` explicitly, and SELECT the MV's row
+         count for the prefix. Record (k, observed_count).
+       - Fresh connections are deliberate: a single long-lived connection
+         could mask a read-regression bug behind connection-local caching.
+  3. After all steps, run one more fresh-connection SELECT as the final
+     observation.
+  4. Assertions:
+       - `always(count[k+1] >= count[k], …)` between every adjacent pair
+         of recorded reads — the core strict-serializable read ordering
+         invariant.
+       - `always(final >= max(count), …)` for the closing observation.
+       - `sometimes(...)` liveness anchor confirming the closing
+         observation reached the inserted count after the quiet period.
+
+Read failures (connect timeout, server unavailable mid-fault) are skipped
+rather than recorded — they are not regression evidence, and a false
+positive on transient unavailability would obscure real bugs.
+
+This is a `parallel_driver_` — many concurrent instances run because the
+property is about read monotonicity *within* each client's observation
+stream, and prefix-scoping isolates each instance's expected count.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import time
+
+import helper_random
+import psycopg
+from helper_pg import (
+    PGDATABASE,
+    PGHOST,
+    PGPORT,
+    PGUSER,
+    execute_retry,
+)
+from helper_quiet import request_quiet_period
+from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.strict_serializable_reads")
+
+STEPS_PER_INVOCATION = 12  # insert+read steps attempted per driver run
+QUIET_PERIOD_S = 15  # passed to request_quiet_period() before the final read
+FINAL_READ_TIMEOUT_S = 30.0  # how long to keep retrying a failed final read
+FINAL_READ_POLL_S = 0.5  # sleep between final-read retries
+PROBE_CONNECT_TIMEOUT_S = 5  # connect_timeout for each fresh connection
+
+
+def _fresh_select_count(prefix: str) -> int | None:
+    """Open a *new* connection, force strict serializable, and SELECT the
+    MV's row_count for `prefix`. Returns 0 when the MV has no row for the
+    prefix yet, and None on any connect/query failure (which is logged) so
+    the caller can skip the observation rather than conflate fault-induced
+    unavailability with a read regression.
+
+    Setting `transaction_isolation` explicitly costs one extra round trip
+    but defends against future changes to the system default.
+    """
+    try:
+        with psycopg.connect(
+            host=PGHOST,
+            port=PGPORT,
+            user=PGUSER,
+            dbname=PGDATABASE,
+            connect_timeout=PROBE_CONNECT_TIMEOUT_S,
+            autocommit=True,
+        ) as conn, conn.cursor() as cur:
+            cur.execute("SET transaction_isolation TO 'strict serializable'")
+            cur.execute(
+                f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s",
+                (prefix,),
+            )
+            row = cur.fetchone()
+    except Exception as exc:  # noqa: BLE001
+        LOG.info("fresh-connection read for prefix=%s failed (%s)", prefix, exc)
+        return None
+    return 0 if row is None else int(row[0])
+
+
+def main() -> int:
+    """Run one invocation: interleave inserts with fresh reads, then assert."""
+    ensure_table_and_mv()
+
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("strict-serializable driver starting; prefix=%s", prefix)
+
+    # Sequence of (step_index, observed_count). Failed reads are never
+    # appended, so every entry is a real observation.
+    observations: list[tuple[int, int]] = []
+    inserted_steps = 0  # highest step whose INSERT was acknowledged
+
+    for step in range(1, STEPS_PER_INVOCATION + 1):
+        # Each INSERT is one autocommit write; the coordinator stamps it
+        # with an oracle timestamp. Inserting before the read means every
+        # read should be >= the previous one, and the final read should
+        # equal the insert count (modulo catchup; see the liveness anchor).
+        try:
+            execute_retry(
+                f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES (%s, %s)",
+                (step, prefix),
+            )
+        except Exception as exc:  # noqa: BLE001
+            # Persistent insert failure under sustained fault — bail.
+            # Already-recorded observations are still valid evidence for
+            # the monotonicity assertion below.
+            LOG.info("step %d: insert failed (%s); ending step loop", step, exc)
+            break
+        inserted_steps = step
+
+        observed = _fresh_select_count(prefix)
+        if observed is None:
+            # Fault-window read; skip. We do NOT record it so the
+            # adjacent-pair assertion below doesn't see a spurious zero.
+            continue
+        observations.append((step, observed))
+
+    # Settle and take the closing observation. The driver is short and the
+    # observations list is small, so a generous timeout here is fine.
+    request_quiet_period(QUIET_PERIOD_S)
+    # The target is the number of acknowledged INSERTs, not the step of the
+    # last successful read: a skipped read must not lower it. The closing
+    # read should equal it once the MV has caught up.
+    expected_final = inserted_steps
+
+    deadline = time.monotonic() + FINAL_READ_TIMEOUT_S
+    final: int | None = _fresh_select_count(prefix)
+    while final is None and time.monotonic() < deadline:
+        time.sleep(FINAL_READ_POLL_S)
+        final = _fresh_select_count(prefix)
+
+    sometimes(
+        final is not None and final == expected_final,
+        "strict-serializable reads: final fresh-connection read reached inserted count",
+        {
+            "prefix": prefix,
+            "expected_final": expected_final,
+            "final_observed": final,
+            "observations": len(observations),
+        },
+    )
+
+    # ----- monotonicity: adjacent-pair assertion -----
+    # Across the recorded fresh-connection reads, no read may regress.
+    # This is the strict-serializable read-ordering property.
+    for i in range(1, len(observations)):
+        prev_step, prev_count = observations[i - 1]
+        curr_step, curr_count = observations[i]
+        always(
+            curr_count >= prev_count,
+            "strict-serializable reads: fresh-connection read regressed across adjacent observations",
+            {
+                "prefix": prefix,
+                "prev_step": prev_step,
+                "prev_count": prev_count,
+                "curr_step": curr_step,
+                "curr_count": curr_count,
+            },
+        )
+
+    # ----- monotonicity: closing observation dominates the maximum -----
+    # If the closing observation succeeded, it must be >= every earlier
+    # observation. (The final equality with `expected_final` is covered by
+    # the `sometimes` liveness anchor above and is not asserted here.)
+    if final is not None and observations:
+        max_observed = max(c for _, c in observations)
+        always(
+            final >= max_observed,
+            "strict-serializable reads: closing fresh-connection read regressed below earlier maximum",
+            {
+                "prefix": prefix,
+                "final": final,
+                "max_earlier": max_observed,
+            },
+        )
+
+    LOG.info(
+        "strict-serializable driver done; observations=%d final=%s expected_final=%s",
+        len(observations),
+        final,
+        expected_final,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    # The connection constants are already used by `_fresh_select_count`;
+    # this tuple mainly keeps the otherwise-unused `os` import referenced
+    # so static analysis does not flag it.
+    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os)
+    sys.exit(main())

From 06d90fbc6c53e0098671f44e8f244c3db773b962 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 00:40:20 -0400
Subject: [PATCH 27/65] =?UTF-8?q?test/antithesis:=20catalog=20cluster=20?=
 =?UTF-8?q?=E2=80=94=20partial=20epoch-fencing=20SUT=20anchor=20+=20catalo?=
 =?UTF-8?q?g-recovery-consistency=20workload=20driver?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Cargo.lock                                    |   1 +
 src/catalog/Cargo.toml                        |   1 +
 src/catalog/src/durable/persist.rs            |  48 ++++
 .../scratchbook/property-catalog.md           |   2 +
 ...ton_driver_catalog_recovery_consistency.py | 240 ++++++++++++++++++
 5 files changed, 292 insertions(+)
 create mode 100755 test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py

diff --git a/Cargo.lock b/Cargo.lock
index 78cfc5d7ecd59..10cec38aca6fe 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5815,6 +5815,7 @@ dependencies = [
 name = "mz-catalog"
 version = "0.0.0"
 dependencies = [
+ "antithesis_sdk",
  "anyhow",
  "async-trait",
  "base64 0.22.1",
diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml
index 6704bd79d8b06..3553217de30ed 100644
--- a/src/catalog/Cargo.toml
+++ b/src/catalog/Cargo.toml
@@ -10,6 +10,7 @@ publish = false
 workspace = true
 
 [dependencies]
+antithesis_sdk.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 base64.workspace = true
diff --git a/src/catalog/src/durable/persist.rs b/src/catalog/src/durable/persist.rs
index c93830e38d7e3..83d560c98004c 100644
--- a/src/catalog/src/durable/persist.rs
+++ b/src/catalog/src/durable/persist.rs
@@ -17,6 +17,7 @@ use std::str::FromStr;
 use std::sync::{Arc, LazyLock};
 use std::time::{Duration, Instant};
 
+use antithesis_sdk::assert_always_greater_than;
 use async_trait::async_trait;
 use differential_dataflow::lattice::Lattice;
 use futures::{FutureExt, StreamExt};
@@ -41,6 +42,7 @@ use mz_repr::Diff;
 use mz_storage_client::controller::PersistEpoch;
 use mz_storage_types::StorageDiff;
 use mz_storage_types::sources::SourceData;
+use serde_json::json;
 use sha2::Digest;
 use timely::progress::{Antichain, Timestamp as TimelyTimestamp};
 use tracing::{debug, info, warn};
@@ -145,6 +147,21 @@ impl FenceableToken {
                 current_token,
                 fence_token,
             } => {
+                // The two `assert!` calls below are the natural placement
+                // for an Antithesis `assert_always!` covering the
+                // FenceableToken state-machine invariant. They are not
+                // wrapped today because Materialize does not run multiple
+                // concurrent environmentd processes against the same
+                // catalog shard, so the `Fenced` state is unreachable in
+                // every supported topology — including the Antithesis
+                // topology in this repo. Wrapping them would create
+                // assertions Antithesis cannot exercise, which is dead
+                // weight in coverage reports. If we ever ship multi-
+                // environmentd (e.g. for a 0DT-preflight Antithesis run),
+                // convert these to `assert_always!` with distinct
+                // messages so a violation becomes a reportable property
+                // failure rather than a panic. See the
+                // `epoch-fencing-prevents-split-brain` catalog entry.
                 assert!(
                     fence_token > current_token,
                     "must be fenced by higher token; current={current_token:?}, fence={fence_token:?}"
@@ -1182,12 +1199,43 @@ impl UnopenedPersistCatalogState {
                 "fencing previous catalogs"
             );
             if matches!(self.mode, Mode::Writable) {
+                // Snapshot the prior durable epoch so the post-CaS anchor
+                // below can verify monotonicity. Captured before the write
+                // because `compare_and_append` may call `sync()` which
+                // reads new state into `self.fenceable_token`.
+                let prior_durable_epoch = self
+                    .fenceable_token
+                    .token()
+                    .map(|t| t.epoch.get())
+                    .unwrap_or(0);
                 match self
                     .compare_and_append(fence_updates.clone(), commit_ts)
                     .await
                 {
                     Ok(upper) => {
                         commit_ts = upper;
+                        // Antithesis anchor for `epoch-fencing-prevents-
+                        // split-brain`: after our fence-token CaS commits,
+                        // the freshly-minted epoch we just persisted must
+                        // be strictly greater than the prior durable
+                        // epoch. A regression here would mean a future
+                        // lower-epoch writer would not be fenced out by
+                        // the write we just made, opening the split-brain
+                        // window the catalog is supposed to close.
+                        let new_epoch = current_fenceable_token
+                            .token()
+                            .expect("freshly minted Unfenced token always has a current_token")
+                            .epoch
+                            .get();
+                        assert_always_greater_than!(
+                            new_epoch,
+                            prior_durable_epoch,
+                            "catalog fencing: new durable epoch did not strictly increase after fence-token CaS",
+                            &json!({
+                                "prior_durable_epoch": prior_durable_epoch,
+                                "new_epoch": new_epoch,
+                            })
+                        );
                     }
                     Err(CompareAndAppendError::Fence(e)) => return Err(e.into()),
                     Err(e @ CompareAndAppendError::UpperMismatch { .. }) => {
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 746796228d668..93c1380929881 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -15,6 +15,7 @@ Properties that verify data correctness when crashes, network partitions, and co
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — fundamental split-brain prevention; failure here corrupts all state |
+| **Status** | **Partially implemented (SUT-side, single-coordinator scope)** — `src/catalog/src/durable/persist.rs`: an `assert_always_greater_than!(new_epoch, prior_durable_epoch, "catalog fencing: new durable epoch did not strictly increase after fence-token CaS", …)` fires after each successful fence-token CaS in `open_inner`. Every environmentd restart in the Antithesis topology exercises this path. **The cross-coordinator half of the property (a `Fenced` writer being correctly rejected at validate time) is NOT exercised today and is not planned.** Materialize does not run multiple concurrent environmentd processes against the same catalog shard in any supported topology, so the `FenceableToken::Fenced` state is unreachable here. The two `assert!` panics in `FenceableToken::validate` would be the natural Antithesis anchor for that half; they are intentionally left as bare panics with an in-source comment pointing back to this entry, to be promoted to `assert_always!` if a 0DT-preflight-style multi-environmentd topology is ever added. |
 | **Property** | After a coordinator restart with a higher epoch, the old coordinator (lower epoch) cannot successfully write to the catalog persist shard. |
 | **Invariant** | `Always`: once a higher epoch is written to consensus, any compare_and_append from a lower epoch must fail with FenceError. This is a strict safety invariant — every check must hold. |
 | **Antithesis Angle** | Network partition separates old coordinator from consensus while new coordinator starts with higher epoch. When partition heals, old coordinator's in-flight writes must be rejected. Antithesis explores the timing window between old coordinator's last successful write and new coordinator's first write. |
@@ -87,6 +88,7 @@ Properties that verify Materialize's strict serializability guarantee and timest
 |---|---|
 | **Type** | Safety |
 | **Priority** | P1 — catalog corruption on recovery prevents system from starting |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py`. Long-running singleton driver holds an in-process `expected_tables` set across cycles. Each cycle runs one CREATE TABLE or DROP TABLE under `execute_retry`, then opens a *fresh* psycopg connection and SELECTs `mz_tables` filtered to the driver's namespace, asserting `always("catalog recovery: live catalog table set matches in-process expected model", …)`. Cross-cycle stability is exactly the recovery check: if an environmentd restart lands between cycles, the next cycle's read is the post-recovery snapshot. Two `sometimes(...)` anchors record (a) "2+ assertion cycles ran" so the post-restart half is exercised, and (b) "observed environmentd connect failure during run" as a corroborating signal that a fault actually landed. The SUT-side upper-non-regression mirror in `sync_to_current_upper` and the consolidation `assert_always!` are deferred. |
 | **Property** | After coordinator crash and restart, the recovered catalog state is equivalent to the pre-crash state: upper never decreases, snapshot is consolidated, and all committed transactions are visible. |
 | **Invariant** | `Always`: upper(post_restart) >= upper(pre_crash). After sync_to_current_upper(), the snapshot contains no unconsolidated entries (all diffs resolved). |
 | **Antithesis Angle** | Crash coordinator during catalog_transact (after some updates persist but before upper advances). Crash during consolidation. Antithesis explores the timing of crashes within the catalog write path. |
diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
new file mode 100755
index 0000000000000..59385a59a7ac7
--- /dev/null
+++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `catalog-recovery-consistency`.
+
+After environmentd crashes and restarts, the catalog state must be
+consistent with what was committed pre-crash: every previously-acknowledged
+DDL operation must remain visible, and the catalog upper must not regress.
+The user-visible form of this property is: "if I created a table and
+received an OK, the table is still there after a restart."
+
+Approach mirrors `singleton_driver_upsert_state_rehydration.py`:
+  - One `singleton_driver_` per timeline, long enough to span multiple
+    Antithesis-injected environmentd restarts.
+  - In-process `expected_tables: set[str]` model holds the authoritative
+    "what should be in the catalog right now" view.
+  - Per cycle, do some DDL (CREATE TABLE or DROP TABLE), then open a
+    *fresh* psycopg connection and SELECT from `mz_tables` scoped to the
+    driver's namespace, asserting the live catalog matches `expected`.
+  - Cross-cycle stability is the recovery check: if an environmentd
+    restart lands between cycle N and cycle N+1, cycle N+1's read is the
+    post-recovery snapshot and the assertion catches any lost or stuck
+    DDL.
+
+`helper_pg.execute_retry` retries OperationalError transparently, so when
+environmentd is down mid-DDL the call will block-and-retry until the next
+incarnation is reachable. That's exactly the timing we want: the DDL
+either committed pre-crash (in which case it must reappear post-recovery)
+or never committed (in which case the retry re-issues it and the local
+model is updated once it is acknowledged). When the retry budget elapses
+before recovery, we abandon that cycle's DDL without updating the local
+model — fault windows exceeding the budget are *not* property failures.
+
+Two corroborating `sometimes(...)` anchors record (a) whether the driver
+observed a coord-side connect failure during its run, and (b) whether at
+least two assertion-bearing cycles ran (so the assertion at cycle N+1
+genuinely reads post-restart state, not just the same state as N).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import time
+
+import helper_random
+import psycopg
+from helper_pg import (
+    PGDATABASE,
+    PGHOST,
+    PGPORT,
+    PGUSER,
+    execute_retry,
+    query_retry,
+)
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.catalog_recovery_consistency")
+
+# Long-running knobs: the driver owns its timeline and the per-cycle budget
+# has to comfortably exceed environmentd's restart time so a fault landing
+# mid-DDL still resolves before the next cycle. CYCLE_COUNT high enough to
+# give Antithesis multiple windows to land a restart between cycles.
+CYCLE_COUNT = 10  # DDL+verify cycles per driver run
+DROP_PROBABILITY = 0.20  # chance a cycle drops a table when the model is non-empty
+INTER_CYCLE_SLEEP_S = 2.0  # pause after every cycle
+
+PROBE_CONNECT_TIMEOUT_S = 2.0  # truncated to int seconds at each connect call
+
+
+def _fresh_observed_tables(name_prefix: str) -> set[str] | None:
+    """Read the driver's table names from `mz_tables` over a new connection.
+
+    Returns the set of names matching `name_prefix` followed by anything,
+    or `None` on any connect/query failure so the caller can skip the
+    cycle's assertion instead of blaming the property for a fault window.
+    """
+    try:
+        with psycopg.connect(
+            host=PGHOST,
+            port=PGPORT,
+            user=PGUSER,
+            dbname=PGDATABASE,
+            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+            autocommit=True,
+        ) as conn, conn.cursor() as cur:
+            cur.execute(
+                "SELECT name FROM mz_tables WHERE name LIKE %s",  # NOTE: "_" is a LIKE wildcard too
+                (f"{name_prefix}%",),
+            )
+            return {row[0] for row in cur.fetchall()}
+    except Exception:  # noqa: BLE001
+        return None
+
+
+def _saw_coord_unavailable() -> bool:
+    """Probe once whether a brand-new connection can be opened right now.
+
+    Returns True when the connect attempt fails for any reason, False when
+    it succeeds. The result only feeds a `sometimes` anchor as a signal
+    that a fault landed; it never gates the safety assertion.
+    """
+    try:
+        with psycopg.connect(
+            host=PGHOST,
+            port=PGPORT,
+            user=PGUSER,
+            dbname=PGDATABASE,
+            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+            autocommit=True,
+        ) as _conn:
+            pass
+        return False
+    except Exception:  # noqa: BLE001
+        return True
+
+
+def _run_cycle(
+    expected: set[str],
+    name_prefix: str,
+    cycle_idx: int,
+    next_id: int,
+) -> tuple[bool, int]:
+    """Run one create-or-drop + verify cycle against the live catalog.
+
+    Returns (assertions_ran, next_id_after). `assertions_ran` is True iff
+    the DDL was acknowledged and the fresh-connection read succeeded, so
+    the `always` assertion was evaluated. `next_id` advances only on an
+    acknowledged CREATE; a failed CREATE's name is reused next time.
+
+    `expected` is mutated in place, and only after `execute_retry`
+    returns; if it raises, the cycle aborts with the model untouched.
+    NOTE(review): a DROP that committed without an acknowledgement, or
+    any DDL that commits after the retry budget is spent, still leaves
+    model and catalog diverged and a later cycle will report it as a
+    property failure — confirm whether that needs tolerating.
+    """
+    new_id = next_id
+    if expected and helper_random.random_bool(DROP_PROBABILITY):
+        # Drop a random member of the model. Deliberately not `IF EXISTS`:
+        # that would hide a table the catalog lost across a restart.
+        table = sorted(expected)[helper_random.random_int(0, len(expected) - 1)]
+        try:
+            execute_retry(f"DROP TABLE {table}")
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("cycle %d: DROP %s failed (%s); not updating model", cycle_idx, table, exc)
+            return False, new_id
+        expected.discard(table)
+    else:
+        table = f"{name_prefix}_t{new_id:06d}"
+        # IF NOT EXISTS keeps the retry idempotent: when the first attempt
+        # commits but its ack is lost, the retried CREATE must not error.
+        try:
+            execute_retry(f"CREATE TABLE IF NOT EXISTS {table} (id BIGINT NOT NULL)")
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("cycle %d: CREATE %s failed (%s); not updating model", cycle_idx, table, exc)
+            return False, new_id
+        expected.add(table)
+        new_id += 1
+
+    # Verify via a fresh connection; a failed fault-window read is skipped.
+    observed = _fresh_observed_tables(name_prefix)
+    if observed is None:
+        LOG.info("cycle %d: fresh-connection read failed; skipping assertion", cycle_idx)
+        return False, new_id
+
+    always(
+        observed == expected,
+        "catalog recovery: live catalog table set matches in-process expected model",
+        {
+            "cycle": cycle_idx,
+            "name_prefix": name_prefix,
+            "expected_count": len(expected),
+            "observed_count": len(observed),
+            # Cap the diffs so details stay compact on a large divergence.
+            "missing_from_catalog": sorted(expected - observed)[:5],
+            "unexpected_in_catalog": sorted(observed - expected)[:5],
+        },
+    )
+    return True, new_id
+
+
+def main() -> int:
+    """Run CYCLE_COUNT DDL+verify cycles, then emit the `sometimes` anchors."""
+    # Random per-run namespace so concurrent runs never share table names.
+    name_prefix = f"catrec_{helper_random.random_u64():016x}"
+    LOG.info("catalog recovery driver starting; name_prefix=%s", name_prefix)
+
+    expected: set[str] = set()
+    next_id = 0
+    cycles_ran = 0
+    saw_coord_unavailable = False
+
+    for cycle_idx in range(CYCLE_COUNT):
+        ran, next_id = _run_cycle(expected, name_prefix, cycle_idx, next_id)
+        if ran:
+            cycles_ran += 1
+        if _saw_coord_unavailable():
+            saw_coord_unavailable = True
+        time.sleep(INTER_CYCLE_SLEEP_S)
+
+    sometimes(
+        cycles_ran >= 2,
+        "catalog recovery: 2+ assertion-bearing cycles ran in this timeline",
+        {"cycles_ran": cycles_ran, "cycles_planned": CYCLE_COUNT},
+    )
+    sometimes(
+        saw_coord_unavailable,
+        "catalog recovery: observed environmentd connect failure during run",
+        {"cycles_ran": cycles_ran, "saw_coord_unavailable": saw_coord_unavailable},
+    )
+
+    LOG.info(
+        "catalog recovery driver done; cycles_ran=%d/%d expected_size=%d saw_coord_unavailable=%s",
+        cycles_ran,
+        CYCLE_COUNT,
+        len(expected),
+        saw_coord_unavailable,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    # The connection constants are already used by the probe helpers; this
+    # tuple mainly keeps the otherwise-unused `os` and `query_retry`
+    # imports referenced so static analysis does not flag them.
+    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os, query_retry)
+    sys.exit(main())

From 3b9bac51e01d8481f1dcc115f1a96da264d24eed Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 00:54:58 -0400
Subject: [PATCH 28/65] test/antithesis: drop unfireable rehydration anchor;
 bump pg client timeouts; remove dead upsert.rs (classic) antithesis asserts

---
 src/storage/src/upsert.rs                     | 15 ------
 .../scratchbook/property-catalog.md           |  4 +-
 test/antithesis/workload/test/helper_pg.py    | 12 ++++-
 ...ngleton_driver_upsert_state_rehydration.py | 53 +++++++------------
 4 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs
index 5c8922de4c022..cdc583d76b119 100644
--- a/src/storage/src/upsert.rs
+++ b/src/storage/src/upsert.rs
@@ -15,7 +15,6 @@ use std::hash::{Hash, Hasher};
 use std::path::PathBuf;
 use std::sync::Arc;
 
-use antithesis_sdk::{assert_always, assert_unreachable};
 use differential_dataflow::hashable::Hashable;
 use differential_dataflow::{AsCollection, VecCollection};
 use futures::StreamExt;
@@ -35,7 +34,6 @@ use mz_timely_util::builder_async::{
     PressOnDropButton,
 };
 use serde::{Deserialize, Serialize};
-use serde_json::json;
 use sha2::{Digest, Sha256};
 use timely::dataflow::channels::pact::Exchange;
 use timely::dataflow::operators::{Capability, InputCapability, Operator};
@@ -540,11 +538,6 @@ fn stage_input<T, FromTime>(
     }
 
     stash.extend(data.drain(..).map(|((key, value, order), time, diff)| {
-        assert_always!(
-            diff.is_positive(),
-            "upsert: input diff positive (classic)",
-            &json!({"diff": diff.into_inner()})
-        );
         assert!(diff.is_positive(), "invalid upsert input");
         (time, key, Reverse(order), value)
     }));
@@ -640,10 +633,6 @@ async fn drain_staged_input<S, T, FromTime, E>(
         let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) {
             command_state
         } else {
-            assert_unreachable!(
-                "upsert: key missing from commands_state (classic)",
-                &json!({"source_id": source_config.id.to_string()})
-            );
             panic!("key missing from commands_state");
         };
 
@@ -1039,9 +1028,5 @@ async fn process_upsert_state_error<T: Timestamp>(
     let update = HealthStatusUpdate::halting(e.context(context).to_string_with_causes(), None);
     health_output.give(health_cap, (None, update));
     std::future::pending::<()>().await;
-    assert_unreachable!(
-        "upsert: pending future returned (classic)",
-        &json!({"site": "process_upsert_state_error"})
-    );
     unreachable!("pending future never returns");
 }
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 93c1380929881..b09ceb0a00e7c 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -339,8 +339,8 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Reachability (Unreachable) |
 | **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit |
-| **Status** | **Implemented (SUT-side)** — every targeted site in `src/storage/src/upsert.rs` (stash diff-positive, `commands_state` missing key, `process_upsert_state_error` pending-future guard), `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion) gets a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`. Panics still terminate the process; Antithesis now also receives a reportable property failure with rich details. |
-| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). |
+| **Status** | **Implemented (SUT-side, reachable sites only)** — every targeted *reachable* site has a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`: `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion). The mirror sites in `src/storage/src/upsert.rs` (classic) were dropped: `upsert_operator` hard-codes `use_continual_feedback_upsert = true` (commit a63d1763e5, Feb 2025), so the classic-upsert code is provably unreachable in supported configurations and Antithesis-instrumenting it added dead-weight assertions. Panics still terminate the process; Antithesis receives a reportable property failure with rich details for every reachable site. |
+| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically (reachable sites): `assert!(diff.is_positive(), "invalid upsert input")` (upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). |
 | **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. |
 | **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. |
 | **Why It Matters** | These panics indicate the operator entered an internal state its author thought was impossible. Past bugs (commits f177db8286, 1accbe28b3) reached production exactly through these paths. The asserts already exist; we just need to wrap them with the Antithesis SDK so the failures become reportable properties rather than process kills. |
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index d336905b1914b..5c74276fe5f90 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -37,8 +37,16 @@
 
 # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds
 # keep drivers progressing without masking real correctness signals.
-_CONNECT_TIMEOUT_S = 5
-_RETRY_BUDGET_S = 60
+#
+# These need to absorb a full Antithesis quiet period plus restart time for the
+# system to come back. Quiet-period requests in the workload are typically
+# 20-25s; the container then takes a few seconds to become responsive, so the
+# overall budget must comfortably exceed ~30s. The per-attempt connect timeout
+# also has to be long enough to actually complete a TCP+TLS handshake against
+# a hung but recovering materialized — too short and every attempt fails fast
+# and the budget is burned without giving the system a chance to answer.
+_CONNECT_TIMEOUT_S = 15
+_RETRY_BUDGET_S = 120
 _RETRY_INITIAL_S = 0.1
 _RETRY_MAX_S = 2.0
 
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 5c41c406f3210..5f3c13bcdce57 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -34,10 +34,19 @@
 
 The driver also records one `sometimes` anchor confirming that at least
 two assertion-bearing cycles ran (without this, the safety check could be
-vacuously satisfied by a single early settle), and a second anchor
-confirming clusterd was observed unavailable between cycles (best-effort
-proxy for "restart happened" — the helper_pg retry budget makes connect
-errors very rare under normal operation).
+vacuously satisfied by a single early settle).
+
+A previous version of this driver also recorded a "clusterd observed
+non-online" `sometimes` anchor via a once-per-cycle SELECT of
+`mz_internal.mz_cluster_replica_statuses`. That assertion was structurally
+unable to fire here: each cycle requests a 25-second Antithesis quiet
+period before its assertions, the probe runs *after* the quiet period
+(when faults are paused and killed containers have been restored), and
+the introspection view itself lags clusterd death by the
+orchestrator-process 5-second poll. The "did we see a replica go
+offline" signal lives in `anytime_fault_recovery_exercised.py` instead,
+which polls continuously and never requests a quiet period, so it has
+the right shape to observe the offline window.
 
 Distinct prefix per timeline keeps multiple parallel timelines independent.
 """
@@ -102,29 +111,6 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]:
     return True, value
 
 
-def _saw_clusterd_unavailable() -> bool:
-    """Best-effort probe: does `mz_internal.mz_cluster_replica_statuses` show
-    any `antithesis_cluster` replica with `status != 'online'` right now?
-    The status column reports `online` or `offline`. Catching `offline`
-    in a snapshot doesn't *prove* a restart happened (we may have missed
-    a transient flap entirely), but it's a noisy yes-signal that something
-    disturbed the cluster during the cycle.
-    """
-    try:
-        row = query_one_retry("""
-            SELECT EXISTS (
-                SELECT 1
-                FROM mz_internal.mz_cluster_replica_statuses s
-                JOIN mz_cluster_replicas r ON r.id = s.replica_id
-                JOIN mz_clusters c ON c.id = r.cluster_id
-                WHERE c.name = 'antithesis_cluster' AND s.status != 'online'
-            )
-            """)
-    except Exception:  # noqa: BLE001
-        return False
-    return bool(row and row[0])
-
-
 def _run_cycle(
     producer, tracker, expected: dict[str, str | None], cycle_idx: int
 ) -> bool:
@@ -220,25 +206,22 @@ def main() -> int:
     expected: dict[str, str | None] = {}
 
     cycles_run = 0
-    saw_replica_unavailable = False
 
     for cycle_idx in range(CYCLE_COUNT):
         if _run_cycle(producer, tracker, expected, cycle_idx):
             cycles_run += 1
-        if _saw_clusterd_unavailable():
-            saw_replica_unavailable = True
         time.sleep(INTER_CYCLE_SLEEP_S)
 
+    # The "did this run actually span a clusterd restart" anchor is
+    # deliberately not in this driver — see the module docstring. The
+    # `cycles_run >= 2` check below is the rehydration-coverage anchor:
+    # without two post-quiet-period reads, the safety assertions could
+    # be vacuously satisfied by a single early settle.
     sometimes(
         cycles_run >= 2,
         "upsert: rehydration driver ran 2+ assertion cycles",
         {"cycles_run": cycles_run, "cycles_planned": CYCLE_COUNT},
     )
-    sometimes(
-        saw_replica_unavailable,
-        "upsert: rehydration driver observed clusterd replica non-online",
-        {"cycles_run": cycles_run},
-    )
 
     LOG.info("rehydration driver done; %d/%d cycles ran", cycles_run, CYCLE_COUNT)
     return 0

From 4366c9e74eef6a1609ccad00fa0de879635d380d Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 01:06:24 -0400
Subject: [PATCH 29/65] test/antithesis: add second clusterd replica to
 antithesis_cluster for multi-replica fault coverage

---
 test/antithesis/config/docker-compose.yaml    | 41 +++++++++++++++++++
 test/antithesis/mzcompose.py                  | 16 +++++++-
 .../workload/workload-entrypoint.sh           | 39 ++++++++++++------
 3 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 73291200a043c..05d5e1cc27297 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -210,6 +210,45 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+  clusterd2:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    - --scratch-directory=/scratch
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd2
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
   materialized:
     hostname: materialized
     depends_on:
@@ -404,6 +443,8 @@ services:
         condition: service_healthy
       clusterd1:
         condition: service_started
+      clusterd2:
+        condition: service_started
       kafka:
         condition: service_healthy
       schema-registry:
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 552dd1d21e824..d4b31841da46f 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -15,8 +15,11 @@
   - minio             : S3-compatible blob storage for persist
   - zookeeper + kafka : Kafka broker for source ingestion
   - schema-registry   : Avro/Protobuf schemas for kafka sources
-  - clusterd1         : external compute+storage process — fenceable
-                        independently of materialized for fault testing
+  - clusterd1, clusterd2 : two external compute+storage processes — each
+                        backs one replica of `antithesis_cluster`, so
+                        Antithesis killing either container exercises the
+                        compute/storage-replica recovery and rebalancing
+                        paths without taking the cluster offline.
   - materialized      : the SUT (environmentd; clusterd is external)
   - workload          : Python test driver wired to the Antithesis SDK
 
@@ -45,6 +48,7 @@ def __init__(self) -> None:
             "depends_on": {
                 "materialized": {"condition": "service_healthy"},
                 "clusterd1": {"condition": "service_started"},
+                "clusterd2": {"condition": "service_started"},
                 "kafka": {"condition": "service_healthy"},
                 "schema-registry": {"condition": "service_started"},
             },
@@ -71,7 +75,14 @@ def __init__(self) -> None:
     Zookeeper(),
     Kafka(auto_create_topics=True),
     SchemaRegistry(),
+    # Two clusterd processes, one per replica of the unmanaged
+    # `antithesis_cluster`. Provisioning both replicas in the same cluster
+    # exercises multi-replica source ingestion and compute paths
+    # (notably the `compute-replica-epoch-isolation` property), and lets
+    # Antithesis kill either replica's backing container without taking
+    # the workload offline.
     Clusterd(name="clusterd1"),
+    Clusterd(name="clusterd2"),
     Materialized(
         external_blob_store=True,
         external_metadata_store=True,
@@ -99,6 +110,7 @@ def workflow_default(c: Composition) -> None:
         "kafka",
         "schema-registry",
         "clusterd1",
+        "clusterd2",
     )
     c.up("materialized")
     c.up("workload")
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index e660a7904bb46..ce6e664a2c0de 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -25,24 +25,39 @@ until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do
 done
 echo "materialized is healthy."
 
-# Provision an unmanaged cluster backed by the external clusterd1 process.
-# This must run before setup-complete so Test Composer assertions can target
-# the cluster from the start. Idempotent — `IF NOT EXISTS` is unsupported on
-# `CREATE CLUSTER REPLICAS (...)`, so we query mz_clusters first.
+# Provision an unmanaged cluster with one replica per external clusterd
+# process. Multi-replica gives Antithesis the option to kill one
+# clusterd at a time without taking the workload offline, and exercises
+# the multi-replica compute/storage code paths (notably
+# `compute-replica-epoch-isolation`).
+#
+# This must run before setup-complete so Test Composer assertions can
+# target the cluster from the start. Idempotent — `IF NOT EXISTS` is
+# unsupported on `CREATE CLUSTER REPLICAS (...)`, so we query
+# mz_clusters first.
 existing=$(
     psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \
         "SELECT 1 FROM mz_clusters WHERE name = '$CLUSTER'"
 )
 if [[ -z "$existing" ]]; then
-    echo "Provisioning cluster '$CLUSTER' against clusterd1..."
+    echo "Provisioning cluster '$CLUSTER' with replicas on clusterd1 + clusterd2..."
     psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" <<SQL
-CREATE CLUSTER ${CLUSTER} REPLICAS (replica1 (
-    STORAGECTL ADDRESSES ['clusterd1:2100'],
-    STORAGE ADDRESSES ['clusterd1:2103'],
-    COMPUTECTL ADDRESSES ['clusterd1:2101'],
-    COMPUTE ADDRESSES ['clusterd1:2102'],
-    WORKERS 1
-));
+CREATE CLUSTER ${CLUSTER} REPLICAS (
+    replica1 (
+        STORAGECTL ADDRESSES ['clusterd1:2100'],
+        STORAGE ADDRESSES ['clusterd1:2103'],
+        COMPUTECTL ADDRESSES ['clusterd1:2101'],
+        COMPUTE ADDRESSES ['clusterd1:2102'],
+        WORKERS 1
+    ),
+    replica2 (
+        STORAGECTL ADDRESSES ['clusterd2:2100'],
+        STORAGE ADDRESSES ['clusterd2:2103'],
+        COMPUTECTL ADDRESSES ['clusterd2:2101'],
+        COMPUTE ADDRESSES ['clusterd2:2102'],
+        WORKERS 1
+    )
+);
 GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER};
 SQL
 else

From 46664f8e43d955b70260d0c9beb0160b00ce1f97 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 11:30:10 -0400
Subject: [PATCH 30/65] test/antithesis: per-clusterd scratch volume so two
 replicas don't share RocksDB lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When I added clusterd2 in 4366c9e7, both clusterds inherited the
DEFAULT_MZ_VOLUMES list, which uses a single named volume scratch:/scratch.
Docker named volumes are shared across containers by name, so the two
clusterds mounted the same /scratch and contended for RocksDB locks at
/scratch/storage/upsert/<id>/<worker>/LOCK.

This wedged clusterd1: it could never open its upsert RocksDB
("Resource temporarily unavailable" on the LOCK file), entered
Stalled health with "Failed to rehydrate state", broadcast
suspend-and-restart, and looped retry-fail-suspend-restart for the
entire run. The continuous restart loop drove the upsert
feedback-driven snapshot replay path in ways that produced visibly
wrong durable state for the source — exactly what the
upsert-state-rehydrates-correctly assertions caught in the
2026-05-12 05:39 UTC Antithesis report.

Fix: give each clusterd its own per-instance named volume for /scratch.
The other volumes stay shared because they don't take exclusive locks.

Also patch export-compose.py to auto-declare any service-referenced
named volume at the top level — Composition only auto-declares
DEFAULT_MZ_VOLUMES, so without this the custom names broke
`docker compose config`.
---
 test/antithesis/config/docker-compose.yaml |  6 +++--
 test/antithesis/export-compose.py          | 27 +++++++++++++++++++
 test/antithesis/mzcompose.py               | 31 ++++++++++++++++++++--
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 05d5e1cc27297..65fcc7f447fd9 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -205,7 +205,7 @@ services:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
     - tmp:/share/tmp
-    - scratch:/scratch
+    - clusterd1_scratch:/scratch
     restart: 'no'
     stop_grace_period: 120s
     platform: linux/amd64
@@ -244,7 +244,7 @@ services:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
     - tmp:/share/tmp
-    - scratch:/scratch
+    - clusterd2_scratch:/scratch
     restart: 'no'
     stop_grace_period: 120s
     platform: linux/amd64
@@ -475,3 +475,5 @@ volumes:
   tmp: null
   secrets: null
   scratch: null
+  clusterd1_scratch: null
+  clusterd2_scratch: null
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 4e1fb5bece519..a204a76fdbf87 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -189,6 +189,31 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None:
         svc.pop(key, None)
 
 
+def register_referenced_named_volumes(compose: dict[str, Any]) -> None:
+    """Declare any named volume referenced by a service that isn't already
+    declared at the top level. Docker Compose rejects the file otherwise.
+
+    mzcompose's `Composition` only auto-declares the fixed `DEFAULT_MZ_VOLUMES`
+    set; per-service custom named volumes (e.g. `clusterd1_scratch`) reference
+    names that have no top-level entry and fail `docker compose config`.
+    """
+    top_level: dict[str, Any] = compose.setdefault("volumes", {}) or {}
+    compose["volumes"] = top_level
+
+    for svc in compose.get("services", {}).values():
+        for entry in svc.get("volumes", []) or []:
+            if not isinstance(entry, str):
+                continue
+            # Bind mounts (`/host:/container`) start with `/`; named volumes
+            # are bare identifiers. We only auto-declare the latter.
+            if entry.startswith("/"):
+                continue
+            name = entry.split(":", 1)[0]
+            if not name or name in top_level:
+                continue
+            top_level[name] = None
+
+
 def main() -> None:
     # munge_services=False keeps ports bare (e.g., `6875` instead of
     # `127.0.0.1::6875`) — Antithesis is container-to-container, no host
@@ -207,6 +232,8 @@ def main() -> None:
         strip_incompatible_env(svc)
         strip_mzcompose_keys(svc)
 
+    register_referenced_named_volumes(c.compose)
+
     sys.stdout.write(HEADER)
     yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False)
 
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index d4b31841da46f..eb30696a43cf0 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -81,8 +81,35 @@ def __init__(self) -> None:
     # (notably the `compute-replica-epoch-isolation` property), and lets
     # Antithesis kill either replica's backing container without taking
     # the workload offline.
-    Clusterd(name="clusterd1"),
-    Clusterd(name="clusterd2"),
+    #
+    # Each clusterd MUST have its own /scratch volume — the upsert
+    # operator's RocksDB state lives there and takes an exclusive file
+    # lock. The DEFAULT_MZ_VOLUMES list uses a single named volume
+    # `scratch:/scratch` shared across containers; passing per-instance
+    # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the
+    # locks separate while leaving the other volumes shared. Found via
+    # an Antithesis run where clusterd1 could never open
+    # `/scratch/storage/upsert/u3/0/LOCK` because clusterd2 held it;
+    # the resulting continuous suspend-and-restart loop produced wrong
+    # durable upsert state.
+    Clusterd(
+        name="clusterd1",
+        volumes=[
+            "mzdata:/mzdata",
+            "mydata:/var/lib/mysql-files",
+            "tmp:/share/tmp",
+            "clusterd1_scratch:/scratch",
+        ],
+    ),
+    Clusterd(
+        name="clusterd2",
+        volumes=[
+            "mzdata:/mzdata",
+            "mydata:/var/lib/mysql-files",
+            "tmp:/share/tmp",
+            "clusterd2_scratch:/scratch",
+        ],
+    ),
     Materialized(
         external_blob_store=True,
         external_metadata_store=True,

From e98f3dc2bfd38f4c9367b6acfaae6ad98af4f155 Mon Sep 17 00:00:00 2001
From: Patrick Butler <patrick.butler@materialize.com>
Date: Tue, 12 May 2026 14:13:58 -0400
Subject: [PATCH 31/65] test/antithesis: add workload for mysql multithreaded
 replication chain

---
 test/antithesis/config/.env                   |  24 +-
 test/antithesis/config/docker-compose.yaml    |  75 ++++++
 test/antithesis/mzcompose.py                  |  36 +++
 .../properties/mysql-source-no-data-loss.md   | 120 ++++++++++
 .../scratchbook/property-catalog.md           |  20 ++
 test/antithesis/workload/Dockerfile           |   3 +-
 .../test/first_mysql_replica_setup.py         | 159 +++++++++++++
 test/antithesis/workload/test/helper_mysql.py | 159 +++++++++++++
 .../workload/test/helper_mysql_source.py      |  97 ++++++++
 .../test/parallel_driver_mysql_cdc.py         | 223 ++++++++++++++++++
 10 files changed, 895 insertions(+), 21 deletions(-)
 create mode 100644 test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
 create mode 100644 test/antithesis/workload/test/first_mysql_replica_setup.py
 create mode 100644 test/antithesis/workload/test/helper_mysql.py
 create mode 100644 test/antithesis/workload/test/helper_mysql_source.py
 create mode 100644 test/antithesis/workload/test/parallel_driver_mysql_cdc.py

diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env
index d4f160a98596f..92cddafe58f23 100644
--- a/test/antithesis/config/.env
+++ b/test/antithesis/config/.env
@@ -1,21 +1,5 @@
-# Copyright Materialize, Inc. and contributors. All rights reserved.
-#
-# Use of this software is governed by the Business Source License
-# included in the LICENSE file at the root of this repository.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0.
-
-# Compose env-file for `test/antithesis/config/docker-compose.yaml`.
-# Tracked by git only so that the file exists for mzbuild's input
-# fingerprinting and survives `git clean -ffdX` between builds. The
-# committed values are placeholders — `build-antithesis.sh` overwrites
-# them in CI with refs to images pushed to Antithesis's GCP Artifact
-# Registry, and `make export-env` does the same with local-dev refs.
-#
-# If you see these placeholder values on a running cluster, your build
-# pipeline did not regenerate this file. Run:
+# GENERATED FILE — do not edit. Regenerate via:
 #   bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env
-MATERIALIZED_IMAGE=placeholder-not-built
-ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built
+# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.
+MATERIALIZED_IMAGE=ghcr.io/materializeinc/materialize/materialized:mzbuild-EMRA5ARAVQMKNFJIHZJTAPOEWMAGW5TX
+ANTITHESIS_WORKLOAD_IMAGE=ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-YKN4ZHJT7YAPYQLQV5BYTUPCRY2RUDRI
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 65fcc7f447fd9..97d982367e63b 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -171,6 +171,71 @@ services:
       interval: 1s
       start_period: 120s
     platform: linux/amd64
+  mysql:
+    init: true
+    ports:
+    - 3306
+    environment:
+    - MYSQL_ROOT_PASSWORD=p@ssw0rd
+    command:
+    - --secure-file-priv=/var/lib/mysql-files
+    - --log-bin=mysql-bin
+    - --gtid_mode=ON
+    - --enforce_gtid_consistency=ON
+    - --binlog-format=row
+    - --binlog-row-image=full
+    - --binlog-row-metadata=full
+    - --server-id=1
+    - --max-connections=500
+    - --binlog_transaction_dependency_tracking=WRITESET
+    healthcheck:
+      test:
+      - CMD
+      - mysqladmin
+      - ping
+      - --password=p@ssw0rd
+      - --protocol=TCP
+      interval: 1s
+      start_period: 180s
+    volumes:
+    - mysqldata_primary:/var/lib/mysql
+    - mydata:/var/lib/mysql-files
+    image: mysql:9.5.0
+    platform: linux/amd64
+  mysql-replica:
+    init: true
+    ports:
+    - 3306
+    environment:
+    - MYSQL_ROOT_PASSWORD=p@ssw0rd
+    command:
+    - --secure-file-priv=/var/lib/mysql-files
+    - --log-bin=mysql-bin
+    - --gtid_mode=ON
+    - --enforce_gtid_consistency=ON
+    - --binlog-format=row
+    - --binlog-row-image=full
+    - --binlog-row-metadata=full
+    - --server-id=2
+    - --max-connections=500
+    - --log-slave-updates
+    - --skip-replica-start
+    - --replica_parallel_workers=4
+    - --replica_preserve_commit_order=ON
+    healthcheck:
+      test:
+      - CMD
+      - mysqladmin
+      - ping
+      - --password=p@ssw0rd
+      - --protocol=TCP
+      interval: 1s
+      start_period: 180s
+    volumes:
+    - mysqldata_replica:/var/lib/mysql
+    - mydata:/var/lib/mysql-files
+    image: mysql:9.5.0
+    platform: linux/amd64
   clusterd1:
     entrypoint:
     - tini
@@ -419,6 +484,7 @@ services:
     - MZ_NO_BUILTIN_POSTGRES=1
     - MZ_NO_BUILTIN_COCKROACH=1
     - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter
+    - MZ_LICENSE_KEY=/license_key/license_key
     volumes:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
@@ -449,6 +515,10 @@ services:
         condition: service_healthy
       schema-registry:
         condition: service_started
+      mysql:
+        condition: service_healthy
+      mysql-replica:
+        condition: service_healthy
     environment:
     - PGHOST=materialized
     - PGPORT=6875
@@ -458,6 +528,9 @@ services:
     - KAFKA_BROKER=kafka:9092
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
     - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
+    - MYSQL_HOST=mysql
+    - MYSQL_REPLICA_HOST=mysql-replica
+    - MYSQL_PASSWORD=p@ssw0rd
     platform: linux/amd64
     image: ${ANTITHESIS_WORKLOAD_IMAGE}
 networks: {}
@@ -475,5 +548,7 @@ volumes:
   tmp: null
   secrets: null
   scratch: null
+  mysqldata_primary: null
+  mysqldata_replica: null
   clusterd1_scratch: null
   clusterd2_scratch: null
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index eb30696a43cf0..c799269cbd216 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -34,6 +34,7 @@
 from materialize.mzcompose.services.kafka import Kafka
 from materialize.mzcompose.services.materialized import Materialized
 from materialize.mzcompose.services.minio import Minio
+from materialize.mzcompose.services.mysql import MySql, create_mysql_server_args
 from materialize.mzcompose.services.postgres import PostgresMetadata
 from materialize.mzcompose.services.schema_registry import SchemaRegistry
 from materialize.mzcompose.services.zookeeper import Zookeeper
@@ -51,6 +52,8 @@ def __init__(self) -> None:
                 "clusterd2": {"condition": "service_started"},
                 "kafka": {"condition": "service_healthy"},
                 "schema-registry": {"condition": "service_started"},
+                "mysql": {"condition": "service_healthy"},
+                "mysql-replica": {"condition": "service_healthy"},
             },
             "environment": [
                 "PGHOST=materialized",
@@ -64,6 +67,10 @@ def __init__(self) -> None:
                 # Name of the unmanaged cluster the workload-entrypoint
                 # provisions against clusterd1 before emitting setup-complete.
                 "MZ_ANTITHESIS_CLUSTER=antithesis_cluster",
+                # MySQL primary and replica connection details.
+                "MYSQL_HOST=mysql",
+                "MYSQL_REPLICA_HOST=mysql-replica",
+                f"MYSQL_PASSWORD={MySql.DEFAULT_ROOT_PASSWORD}",
             ],
         }
         super().__init__(name="workload", config=config)
@@ -75,6 +82,33 @@ def __init__(self) -> None:
     Zookeeper(),
     Kafka(auto_create_topics=True),
     SchemaRegistry(),
+    # MySQL primary — GTID-enabled with WRITESET dependency tracking so the
+    # replica can safely use parallel workers without losing commit order.
+    MySql(
+        use_seeded_image=False,
+        volumes=[
+            "mysqldata_primary:/var/lib/mysql",
+            "mydata:/var/lib/mysql-files",
+        ],
+        additional_args=create_mysql_server_args(server_id="1", is_master=True)
+        + ["--binlog_transaction_dependency_tracking=WRITESET"],
+    ),
+    # MySQL replica — multithreaded replication (4 workers, commit-order
+    # preserved).  Replication is configured at runtime by
+    # first_mysql_replica_setup.py after both containers are healthy.
+    MySql(
+        name="mysql-replica",
+        use_seeded_image=False,
+        volumes=[
+            "mysqldata_replica:/var/lib/mysql",
+            "mydata:/var/lib/mysql-files",
+        ],
+        additional_args=create_mysql_server_args(server_id="2", is_master=False)
+        + [
+            "--replica_parallel_workers=4",
+            "--replica_preserve_commit_order=ON",
+        ],
+    ),
     # Two clusterd processes, one per replica of the unmanaged
     # `antithesis_cluster`. Provisioning both replicas in the same cluster
     # exercises multi-replica source ingestion and compute paths
@@ -138,6 +172,8 @@ def workflow_default(c: Composition) -> None:
         "schema-registry",
         "clusterd1",
         "clusterd2",
+        "mysql",
+        "mysql-replica",
     )
     c.up("materialized")
     c.up("workload")
diff --git a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
new file mode 100644
index 0000000000000..19f6d02d68974
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
@@ -0,0 +1,120 @@
+# mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible in Materialize
+
+## Summary
+
+Every row inserted to the MySQL primary must eventually appear — with the
+correct value — in the Materialize CDC source that reads from the
+multithreaded MySQL replica. The pipeline is:
+
+```
+MySQL primary  --GTID binlog-->  MySQL replica (4 parallel workers)
+                                      |
+                              Materialize CDC source
+                              (antithesis_cluster)
+                                      |
+                              antithesis_cdc table
+```
+
+## Instrumentation
+
+**Workload-side** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py`.
+
+Each `parallel_driver_` invocation:
+1. Assigns a per-invocation `batch_id` prefix (Antithesis-seeded RNG).
+2. Inserts `ROWS_PER_INVOCATION` (20) rows to `antithesis.cdc_test` on the
+   MySQL primary, recording the expected `{id → value}` map locally.
+3. Requests an Antithesis quiet period (25 s) and polls `antithesis_cdc` in
+   Materialize until all expected rows appear or the 90 s budget expires.
+4. Fires:
+   - `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)`
+     — liveness anchor; confirms at least one invocation reaches full catchup.
+   - `always("mysql: CDC source row has correct value after catchup", …)` — safety;
+     fired once per row, catches wrong-value corruption.
+   - `always("mysql: CDC source row count matches inserted count after catchup", …)`
+     — safety; catches extra phantom rows (count > expected) or missing rows
+     (count < expected) at the batch level.
+
+**First-run setup** — `test/antithesis/workload/test/first_mysql_replica_setup.py`.
+
+Runs once per Antithesis timeline before any parallel drivers start:
+- Creates `antithesis.cdc_test` on the primary.
+- Configures the replica channel (`CHANGE REPLICATION SOURCE TO … SOURCE_AUTO_POSITION=1`).
+- Sets `replica_parallel_workers = 4`, `replica_preserve_commit_order = ON`.
+- Starts the replica.
+- Creates the Materialize connection (`antithesis_mysql_conn`), source
+  (`mysql_cdc_source`), and table (`antithesis_cdc`).
+- Fires `reachable("mysql: first-run setup complete …")` so Antithesis can
+  confirm the setup path is exercised in every timeline.
+- Fires `sometimes("mysql replica: antithesis.cdc_test replicated from primary within 90s", …)`
+  to confirm initial replication is flowing before the source is created.
+
+## Why This Property Matters
+
+MySQL CDC via a multithreaded replica is a distinct and failure-prone code
+path compared to the Kafka/upsert path that the existing drivers exercise.
+Key fault scenarios exposed:
+
+- **Replica kills** — if Antithesis kills the MySQL replica
+  container, the replica restarts from its persisted GTID position (the
+  replica data volume is persistent). The Materialize source must reconnect
+  and resume without dropping rows.
+
+- **Parallel replication ordering** — with 4 parallel workers and
+  `replica_preserve_commit_order=ON`, the replica applies transactions
+  concurrently but in primary commit order. Antithesis can inject scheduling
+  jitter that stresses the ordering protocol.
+
+- **Primary kills** — if Antithesis kills the MySQL primary, the replica
+  loses its upstream. Materialize's CDC source must handle the replica going
+  silent gracefully (not panic, not report wrong data).
+
+- **Materialize clusterd restarts** — the MySQL CDC source resumes from the
+  last committed GTID in the persist shard, similar to the Kafka source
+  resume-offset logic. The existing `storage-command-replay-idempotent`
+  property is also exercised through the MySQL code path.
+
+## Assertion Types Chosen
+
+- `sometimes(…)` for liveness (catchup): the system must make progress at
+  least once per run. Under heavy fault injection catchup may not complete
+  every invocation; that's expected. We care that it succeeds at least once.
+
+- `always(…)` for safety (per-row value, batch count): once we've confirmed
+  catchup, every observable row must be correct. This is a hard safety
+  invariant.
+
+- `reachable(…)` for setup completion: ensures Antithesis counts the
+  first-run setup as an exercised path across the run.
+
+## Related Properties
+
+- `storage-command-replay-idempotent` — MySQL CDC resume on clusterd restart
+  exercises the same command-history replay path as Kafka sources.
+- `fault-recovery-exercised` — the `sometimes(…)` recovery probe also fires
+  after MySQL-induced coordinator failures.
+- `kafka-source-survives-clusterd-restart` — shares the "source resumes after
+  storage worker kill" structure; MySQL adds the replica-replication dimension.
+
+## Schema
+
+```sql
+-- MySQL (primary and replica via replication):
+CREATE TABLE antithesis.cdc_test (
+    id         VARCHAR(64) NOT NULL PRIMARY KEY,
+    batch_id   VARCHAR(64) NOT NULL,
+    value      TEXT NOT NULL,
+    updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6)
+                ON UPDATE CURRENT_TIMESTAMP(6)
+);
+
+-- Materialize:
+CREATE SECRET antithesis_mysql_password AS '…';
+CREATE CONNECTION antithesis_mysql_conn TO MYSQL (
+    HOST 'mysql-replica', USER 'root',
+    PASSWORD SECRET antithesis_mysql_password
+);
+CREATE SOURCE mysql_cdc_source IN CLUSTER antithesis_cluster
+    FROM MYSQL CONNECTION antithesis_mysql_conn;
+CREATE TABLE antithesis_cdc
+    FROM SOURCE mysql_cdc_source (REFERENCE antithesis.cdc_test);
+```
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index b09ceb0a00e7c..2c308cf3e2e2b 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -2,6 +2,7 @@
 commit: 007c7af9d9970fb2030c7212368b232e0fbc363e
 updated: 2026-05-12
 ---
+<!-- Category 8 (MySQL CDC) added 2026-05-12: mysql-source-no-data-loss -->
 
 # Property Catalog: Materialize
 
@@ -404,6 +405,25 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. |
 | **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. |
 
+## Category 8: MySQL CDC Source
+
+Properties specific to Materialize's MySQL CDC source pipeline, which reads
+from a multithreaded MySQL replica. The topology adds a MySQL primary (GTID +
+WRITESET dependency tracking) and a MySQL replica (4 parallel workers,
+commit-order preservation) to the Antithesis environment.
+
+### mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible
+
+| | |
+|---|---|
+| **Type** | Liveness + Safety |
+| **Priority** | P1 — end-to-end correctness of the MySQL CDC pipeline; tests a distinct code path from Kafka |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, waits for a quiet period, then polls `antithesis_cdc` until all rows appear (or 90 s budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. |
+| **Property** | After a row is inserted into the MySQL primary, the Materialize CDC source — which reads from the GTID-based multithreaded replica fed by the primary's binlog — eventually contains that row with the correct value. |
+| **Invariant** | `Always`: after catchup, for every row inserted to `antithesis.cdc_test` on the primary, `SELECT value FROM antithesis_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the quiet-period budget at least once per run. |
+| **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. |
+| **Why It Matters** | MySQL CDC is a distinct ingestion code path from Kafka. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume — is not caught by the Kafka-source drivers. |
+
 ### offset-known-not-below-committed — Source Statistics Causality
 
 | | |
diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile
index b72a6b541d818..5cca619ed8234 100644
--- a/test/antithesis/workload/Dockerfile
+++ b/test/antithesis/workload/Dockerfile
@@ -22,7 +22,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN pip install --no-cache-dir \
     psycopg[binary]==3.2.9 \
     confluent-kafka==2.8.0 \
-    antithesis==0.2.0
+    antithesis==0.2.0 \
+    PyMySQL==1.1.1
 
 # setup-complete script
 COPY setup-complete.sh /usr/local/bin/setup-complete.sh
diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py
new file mode 100644
index 0000000000000..4380b5f4bd40d
--- /dev/null
+++ b/test/antithesis/workload/test/first_mysql_replica_setup.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis first_ command: configure MySQL multithreaded replica replication
+and create the Materialize MySQL CDC source.
+
+Runs once per Antithesis timeline before any parallel/singleton drivers start.
+Steps:
+  1. Wait for both MySQL containers to accept connections.
+  2. Create the `antithesis` database and `cdc_test` table on the primary.
+  3. Configure the replica to replicate from the primary via GTID with 4
+     parallel worker threads (multithreaded replication).
+  4. Start the replica.
+  5. Wait for `antithesis.cdc_test` to appear on the replica (confirms
+     replication is flowing).
+  6. Create the Materialize connection, source, and table from the replica.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_mysql
+from helper_mysql_source import ensure_mysql_cdc_source
+
+from antithesis.assertions import reachable, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("first.mysql_replica_setup")
+
+
+def setup_primary() -> None:
+    """Create the antithesis schema and cdc_test table on the MySQL primary."""
+    LOG.info("creating antithesis database and cdc_test table on primary")
+    helper_mysql.execute_primary("CREATE DATABASE IF NOT EXISTS antithesis")
+    helper_mysql.execute_primary(
+        """
+        CREATE TABLE IF NOT EXISTS antithesis.cdc_test (
+            id VARCHAR(64) NOT NULL PRIMARY KEY,
+            batch_id VARCHAR(64) NOT NULL,
+            value TEXT NOT NULL,
+            updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6)
+                ON UPDATE CURRENT_TIMESTAMP(6)
+        )
+        """,
+        database="antithesis",
+    )
+    LOG.info("antithesis.cdc_test ready on primary")
+
+
+def configure_replica() -> None:
+    """Configure the MySQL replica to replicate from the primary.
+
+    Uses GTID auto-positioning with 4 parallel workers. The replica starts
+    with --skip-replica-start so we configure the channel before starting.
+    Idempotent: stops and resets any existing channel first.
+    """
+    LOG.info(
+        "configuring replica to replicate from %s with 4 parallel workers",
+        helper_mysql.MYSQL_HOST,
+    )
+    # Best-effort cleanup of any existing channel; failures are logged, not fatal.
+    try:
+        helper_mysql.execute_replica("STOP REPLICA")
+    except Exception as exc:  # noqa: BLE001
+        LOG.info("STOP REPLICA failed; continuing: %s", exc)
+    try:
+        helper_mysql.execute_replica("RESET REPLICA ALL")
+    except Exception as exc:  # noqa: BLE001
+        LOG.info("RESET REPLICA ALL failed; continuing: %s", exc)
+
+    helper_mysql.execute_replica(
+        f"CHANGE REPLICATION SOURCE TO "
+        f"SOURCE_HOST='{helper_mysql.MYSQL_HOST}', "
+        f"SOURCE_USER='root', "
+        f"SOURCE_PASSWORD='{helper_mysql.MYSQL_PASSWORD}', "
+        f"SOURCE_AUTO_POSITION=1, "
+        f"GET_SOURCE_PUBLIC_KEY=1"
+    )
+    # Set parallel replication parameters before starting.
+    helper_mysql.execute_replica("SET GLOBAL replica_parallel_workers = 4")
+    helper_mysql.execute_replica("SET GLOBAL replica_preserve_commit_order = ON")
+    helper_mysql.execute_replica("START REPLICA")
+    LOG.info("MySQL replica started")
+
+
+def wait_for_replica_table(timeout_s: float = 90.0) -> bool:
+    """Wait until antithesis.cdc_test is visible on the replica.
+
+    Returns True when the table appears (replication is flowing), False on
+    timeout.
+    """
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        try:
+            rows = helper_mysql.query_replica(
+                "SELECT 1 FROM information_schema.tables "
+                "WHERE table_schema = 'antithesis' AND table_name = 'cdc_test'",
+            )
+            if rows:
+                LOG.info("antithesis.cdc_test visible on replica — replication flowing")
+                return True
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("waiting for replica table: %s", exc)
+        time.sleep(2)
+    LOG.warning("timed out waiting for antithesis.cdc_test on replica")
+    return False
+
+
+def main() -> int:
+    LOG.info("waiting for MySQL primary (%s)...", helper_mysql.MYSQL_HOST)
+    helper_mysql.wait_for_primary()
+
+    LOG.info("waiting for MySQL replica (%s)...", helper_mysql.MYSQL_REPLICA_HOST)
+    helper_mysql.wait_for_replica()
+
+    setup_primary()
+    configure_replica()
+
+    replica_ready = wait_for_replica_table()
+    sometimes(
+        replica_ready,
+        "mysql replica: antithesis.cdc_test replicated from primary within 90s",
+        {
+            "primary": helper_mysql.MYSQL_HOST,
+            "replica": helper_mysql.MYSQL_REPLICA_HOST,
+        },
+    )
+    if not replica_ready:
+        # Proceed anyway — replication may catch up before Materialize tries to
+        # validate the source, but log a warning so triage can correlate.
+        LOG.warning("replica table not yet visible; proceeding with source creation")
+
+    ensure_mysql_cdc_source()
+
+    reachable(
+        "mysql: first-run setup complete — replica configured, Materialize source created",
+        {
+            "primary": helper_mysql.MYSQL_HOST,
+            "replica": helper_mysql.MYSQL_REPLICA_HOST,
+        },
+    )
+    LOG.info("MySQL CDC setup complete")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py
new file mode 100644
index 0000000000000..e99b3656cb4dd
--- /dev/null
+++ b/test/antithesis/workload/test/helper_mysql.py
@@ -0,0 +1,159 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""MySQL connection helpers for Antithesis drivers.
+
+Connects to the MySQL primary and replica via PyMySQL. All calls retry
+transient network and operational errors up to a fixed budget so the
+workload keeps progressing under fault injection.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+
+import pymysql
+import pymysql.cursors
+
+LOG = logging.getLogger("antithesis.helper_mysql")
+
+MYSQL_HOST = os.environ.get("MYSQL_HOST", "mysql")
+MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica")
+MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306"))
+MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd")
+
+_RETRY_BUDGET_S = 120
+_RETRY_INITIAL_S = 0.5
+_RETRY_MAX_S = 4.0
+
+
+def _retryable(exc: BaseException) -> bool:
+    return isinstance(exc, (pymysql.OperationalError, pymysql.InterfaceError))
+
+
+def _open(host: str, database: str) -> pymysql.connections.Connection:
+    """Open a single MySQL connection with retries on transient errors."""
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            return pymysql.connect(
+                host=host,
+                port=MYSQL_PORT,
+                user="root",
+                password=MYSQL_PASSWORD,
+                database=database,
+                connect_timeout=15,
+                autocommit=True,
+            )
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info(
+                "mysql connect to %s retrying after %s; backoff=%.2fs",
+                host,
+                exc,
+                backoff,
+            )
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+
+
+def _execute(host: str, sql: str, params: tuple = (), database: str = "mysql") -> None:
+    """Run `sql` on `host` over a fresh connection, retrying transient errors."""
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            # `with conn` closes the connection even if execute() raises.
+            with _open(host, database) as conn, conn.cursor() as cur:
+                cur.execute(sql, params)
+            return
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info("mysql execute on %s retrying after %s", host, exc)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+
+
+def _query(
+    host: str, sql: str, params: tuple = (), database: str = "mysql"
+) -> list[tuple]:
+    """Run `sql` on `host` and return all rows, retrying transient errors."""
+    deadline = time.monotonic() + _RETRY_BUDGET_S
+    backoff = _RETRY_INITIAL_S
+    while True:
+        try:
+            # `with conn` closes the connection even if the query raises.
+            with _open(host, database) as conn, conn.cursor() as cur:
+                cur.execute(sql, params)
+                result = list(cur.fetchall())
+            return result
+        except Exception as exc:  # noqa: BLE001
+            if not _retryable(exc) or time.monotonic() > deadline:
+                raise
+            LOG.info("mysql query on %s retrying after %s", host, exc)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, _RETRY_MAX_S)
+
+
+def execute_primary(sql: str, params: tuple = (), database: str = "mysql") -> None:
+    """Execute a statement on the MySQL primary."""
+    _execute(MYSQL_HOST, sql, params, database)
+
+
+def execute_replica(sql: str, params: tuple = (), database: str = "mysql") -> None:
+    """Execute a statement on the MySQL replica."""
+    _execute(MYSQL_REPLICA_HOST, sql, params, database)
+
+
+def query_primary(
+    sql: str, params: tuple = (), database: str = "mysql"
+) -> list[tuple]:
+    """Run a query on the MySQL primary and return all rows."""
+    return _query(MYSQL_HOST, sql, params, database)
+
+
+def query_replica(
+    sql: str, params: tuple = (), database: str = "mysql"
+) -> list[tuple]:
+    """Run a query on the MySQL replica and return all rows."""
+    return _query(MYSQL_REPLICA_HOST, sql, params, database)
+
+
+def wait_for_host(host: str, timeout_s: float = 180.0) -> None:
+    """Block until MySQL on `host` accepts connections."""
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        try:
+            conn = pymysql.connect(
+                host=host,
+                port=MYSQL_PORT,
+                user="root",
+                password=MYSQL_PASSWORD,
+                connect_timeout=5,
+            )
+            conn.close()
+            LOG.info("mysql %s is ready", host)
+            return
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("waiting for mysql %s: %s", host, exc)
+            time.sleep(2)
+    raise TimeoutError(f"MySQL at {host} not ready after {timeout_s}s")
+
+
+def wait_for_primary(timeout_s: float = 180.0) -> None:
+    wait_for_host(MYSQL_HOST, timeout_s)
+
+
+def wait_for_replica(timeout_s: float = 180.0) -> None:
+    wait_for_host(MYSQL_REPLICA_HOST, timeout_s)
diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py
new file mode 100644
index 0000000000000..6572eddc9c7e4
--- /dev/null
+++ b/test/antithesis/workload/test/helper_mysql_source.py
@@ -0,0 +1,97 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis MySQL CDC source in Materialize.
+
+The MySQL CDC pipeline:
+  mysql (primary) --binlog--> mysql-replica --CDC--> Materialize
+
+Materialize reads from the replica so that faults to the replica exercise
+the Materialize source recovery path independently of faults to the primary.
+
+Objects created in Materialize:
+  - SECRET  antithesis_mysql_password
+  - CONNECTION antithesis_mysql_conn  -> mysql-replica
+  - SOURCE  mysql_cdc_source          (IN CLUSTER antithesis_cluster)
+  - TABLE   antithesis_cdc            (REFERENCE antithesis.cdc_test)
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+import psycopg
+
+from helper_pg import create_source_idempotent, execute_retry, query_retry
+
+LOG = logging.getLogger("antithesis.helper_mysql_source")
+
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica")
+MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd")
+
+MYSQL_DATABASE = "antithesis"
+MYSQL_TABLE = "cdc_test"
+
+SECRET_NAME = "antithesis_mysql_password"
+CONNECTION_NAME = "antithesis_mysql_conn"
+SOURCE_NAME = "mysql_cdc_source"
+TABLE_NAME = "antithesis_cdc"
+
+
+def ensure_mysql_connection() -> None:
+    """Create the MySQL secret and connection in Materialize (idempotent)."""
+    execute_retry(
+        f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'"
+    )
+    execute_retry(
+        f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO MYSQL ("
+        f"HOST '{MYSQL_REPLICA_HOST}', "
+        f"USER 'root', "
+        f"PASSWORD SECRET {SECRET_NAME}"
+        f")"
+    )
+    LOG.info("mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST)
+
+
+def ensure_mysql_cdc_table() -> None:
+    """Create the Materialize table from the MySQL CDC source (idempotent)."""
+    try:
+        execute_retry(
+            f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} "
+            f"FROM SOURCE {SOURCE_NAME} "
+            f"(REFERENCE {MYSQL_DATABASE}.{MYSQL_TABLE})"
+        )
+    except psycopg.errors.InternalError as exc:
+        if "already exists" not in str(exc):
+            raise
+        rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,))
+        if rows:
+            LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME)
+            return
+        raise
+    LOG.info("mysql cdc table %s ready", TABLE_NAME)
+
+
+def ensure_mysql_cdc_source() -> None:
+    """Create the full MySQL CDC pipeline in Materialize (idempotent).
+
+    Requires antithesis.cdc_test to already exist on the MySQL replica.
+    Call first_mysql_replica_setup.py before this in any standalone use.
+    """
+    ensure_mysql_connection()
+    create_source_idempotent(
+        f"CREATE SOURCE IF NOT EXISTS {SOURCE_NAME} "
+        f"IN CLUSTER {CLUSTER} "
+        f"FROM MYSQL CONNECTION {CONNECTION_NAME}",
+        SOURCE_NAME,
+    )
+    LOG.info("mysql cdc source %s ready", SOURCE_NAME)
+    ensure_mysql_cdc_table()
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
new file mode 100644
index 0000000000000..67a9627e1e386
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `mysql-source-no-data-loss`.
+
+Every row inserted to the MySQL primary must eventually appear — with the
+correct value — in the Materialize CDC source that reads from the
+multithreaded replica.
+
+Each invocation:
+  1. Checks the MySQL CDC source exists (created by first_mysql_replica_setup).
+  2. Picks a per-invocation `batch_id` prefix so concurrent drivers don't
+     collide.
+  3. Inserts ROWS_PER_INVOCATION rows to the MySQL primary, recording the
+     expected {id → value} map locally.
+  4. Requests an Antithesis quiet period and polls the Materialize source
+     table until all expected rows appear (or the budget expires).
+  5. Asserts correctness via `always(...)` on count and per-row values.
+     A `sometimes(...)` liveness anchor fires on successful catchup.
+
+This is a `parallel_driver_` — Antithesis runs many concurrent instances.
+Each assigns itself a fresh prefix from the Antithesis-seeded RNG so
+parallel drivers exercise the MySQL CDC path simultaneously without
+interfering with each other's expected-state model.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_mysql
+import helper_random
+from helper_mysql_source import SOURCE_NAME, TABLE_NAME
+from helper_pg import query_retry
+from helper_quiet import request_quiet_period
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.mysql_cdc")
+
+ROWS_PER_INVOCATION = 20
+QUIET_PERIOD_S = 25
+CATCHUP_TIMEOUT_S = 90.0
+POLL_INTERVAL_S = 1.0
+
+
+def _source_exists() -> bool:
+    rows = query_retry("SELECT 1 FROM mz_sources WHERE name = %s", (SOURCE_NAME,))
+    return bool(rows)
+
+
+def _insert_rows(batch_id: str) -> dict[str, str]:
+    """Insert ROWS_PER_INVOCATION rows to the MySQL primary.
+
+    Returns {id → value} for every successfully inserted row.
+    """
+    expected: dict[str, str] = {}
+    for i in range(ROWS_PER_INVOCATION):
+        row_id = f"{batch_id}:{i}"
+        value = f"v{helper_random.random_int(0, 9999):04d}"
+        try:
+            helper_mysql.execute_primary(
+                "INSERT INTO antithesis.cdc_test (id, batch_id, value) "
+                "VALUES (%s, %s, %s) "
+                "ON DUPLICATE KEY UPDATE value = VALUES(value), batch_id = VALUES(batch_id)",
+                (row_id, batch_id, value),
+                database="antithesis",
+            )
+            expected[row_id] = value
+        except Exception as exc:  # noqa: BLE001
+            # Under fault injection a write to the primary may fail. Skip the
+            # row rather than crashing so the driver keeps inserting others.
+            LOG.info("insert failed for row %s: %s; skipping", row_id, exc)
+    return expected
+
+
+def _wait_for_catchup(batch_id: str, expected_count: int) -> bool:
+    """Poll Materialize until all expected rows for `batch_id` appear.
+
+    Returns True when `COUNT(*) WHERE batch_id = ?` reaches expected_count,
+    False on timeout.
+    """
+    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
+    last_seen = -1
+    while time.monotonic() < deadline:
+        try:
+            rows = query_retry(
+                f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
+                (batch_id,),
+            )
+            count = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("catchup poll failed: %s; retrying", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        if count != last_seen:
+            LOG.info(
+                "mysql cdc catchup: batch=%s observed=%d target=%d",
+                batch_id,
+                count,
+                expected_count,
+            )
+            last_seen = count
+
+        if count >= expected_count:
+            return True
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.warning(
+        "mysql cdc catchup timeout: batch=%s last_seen=%d target=%d",
+        batch_id,
+        last_seen,
+        expected_count,
+    )
+    return False
+
+
+def _check_rows(expected: dict[str, str]) -> None:
+    """Assert every expected row has the correct value in the Materialize source."""
+    for row_id, want in expected.items():
+        rows = query_retry(
+            f"SELECT value FROM {TABLE_NAME} WHERE id = %s",
+            (row_id,),
+        )
+        found = bool(rows)
+        observed = rows[0][0] if found else None
+        always(
+            found and observed == want,
+            "mysql: CDC source row has correct value after catchup",
+            {
+                "source": TABLE_NAME,
+                "id": row_id,
+                "expected_value": want,
+                "observed_present": found,
+                "observed_value": observed,
+            },
+        )
+
+
+def main() -> int:
+    if not _source_exists():
+        # first_mysql_replica_setup must run before this driver. Outside
+        # Antithesis (e.g. snouty validate) the source may not exist yet —
+        # exit cleanly rather than erroring so validate can still proceed.
+        LOG.warning(
+            "mysql cdc source %s not found; skipping "
+            "(first_mysql_replica_setup must run first)",
+            SOURCE_NAME,
+        )
+        return 0
+
+    batch_id = f"p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; batch_id=%s", batch_id)
+
+    expected = _insert_rows(batch_id)
+    if not expected:
+        LOG.info("no rows inserted successfully this invocation; exiting cleanly")
+        return 0
+
+    LOG.info("inserted %d rows; requesting quiet period", len(expected))
+    request_quiet_period(QUIET_PERIOD_S)
+
+    caught_up = _wait_for_catchup(batch_id, len(expected))
+
+    # Liveness anchor: at least one invocation should fully catch up. If this
+    # never fires across an entire run the safety assertions below are vacuous.
+    sometimes(
+        caught_up,
+        "mysql: CDC source caught up to all primary inserts after quiet period",
+        {
+            "source": TABLE_NAME,
+            "batch_id": batch_id,
+            "rows_inserted": len(expected),
+        },
+    )
+
+    if not caught_up:
+        # Don't run per-row safety assertions on stale data — a slow catchup
+        # is a separate concern from row-level correctness.
+        LOG.info("catchup did not complete in budget; skipping per-row assertions")
+        return 0
+
+    # Safety: every row we inserted must be present with the correct value.
+    _check_rows(expected)
+
+    # Count-level safety check: no extra rows for our batch_id should exist.
+    rows = query_retry(
+        f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
+        (batch_id,),
+    )
+    count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+    always(
+        count_in_mz == len(expected),
+        "mysql: CDC source row count matches inserted count after catchup",
+        {
+            "source": TABLE_NAME,
+            "batch_id": batch_id,
+            "expected_count": len(expected),
+            "observed_count": count_in_mz,
+        },
+    )
+
+    LOG.info(
+        "driver done; asserted on %d rows for batch_id=%s", len(expected), batch_id
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 8dedd7b9a42eae11e4a277cccb7d34154251830b Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 14:23:29 -0400
Subject: [PATCH 32/65] test/antithesis: clusterd workers=4 per replica to
 exercise multi-worker thread pausing

---
 test/antithesis/config/.env                   | 24 +++++++++++++++----
 test/antithesis/config/docker-compose.yaml    |  9 ++++---
 test/antithesis/mzcompose.py                  | 14 ++++++++++-
 .../workload/workload-entrypoint.sh           |  4 ++--
 4 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env
index 92cddafe58f23..d4f160a98596f 100644
--- a/test/antithesis/config/.env
+++ b/test/antithesis/config/.env
@@ -1,5 +1,21 @@
-# GENERATED FILE — do not edit. Regenerate via:
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Compose env-file for `test/antithesis/config/docker-compose.yaml`.
+# Tracked by git only so that the file exists for mzbuild's input
+# fingerprinting and survives `git clean -ffdX` between builds. The
+# committed values are placeholders — `build-antithesis.sh` overwrites
+# them in CI with refs to images pushed to Antithesis's GCP Artifact
+# Registry, and `make export-env` does the same with local-dev refs.
+#
+# If you see these placeholder values on a running cluster, your build
+# pipeline did not regenerate this file. Run:
 #   bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env
-# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.
-MATERIALIZED_IMAGE=ghcr.io/materializeinc/materialize/materialized:mzbuild-EMRA5ARAVQMKNFJIHZJTAPOEWMAGW5TX
-ANTITHESIS_WORKLOAD_IMAGE=ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-YKN4ZHJT7YAPYQLQV5BYTUPCRY2RUDRI
+MATERIALIZED_IMAGE=placeholder-not-built
+ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 97d982367e63b..b9c383be7f1a5 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -260,10 +260,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -299,10 +299,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -484,7 +484,6 @@ services:
     - MZ_NO_BUILTIN_POSTGRES=1
     - MZ_NO_BUILTIN_COCKROACH=1
     - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter
-    - MZ_LICENSE_KEY=/license_key/license_key
     volumes:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index c799269cbd216..fbcfa2da9fbb5 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -116,9 +116,19 @@ def __init__(self) -> None:
     # Antithesis kill either replica's backing container without taking
     # the workload offline.
     #
+    # `workers=4` per clusterd means each replica runs four timely worker
+    # threads in one process. The extra intra-process parallelism is the
+    # surface area Antithesis's thread-pausing fault targets — with a
+    # single worker, "pause one thread" effectively pauses the whole
+    # process, which the container-pause fault already covers. The matching
+    # `WORKERS 4` in the CREATE CLUSTER REPLICAS statement must stay in
+    # lockstep with this value (it's read by the controller, not by
+    # clusterd).
+    #
     # Each clusterd MUST have its own /scratch volume — the upsert
     # operator's RocksDB state lives there and takes an exclusive file
-    # lock. The DEFAULT_MZ_VOLUMES list uses a single named volume
+    # lock per worker (`/scratch/storage/upsert/<id>/<worker>/LOCK`).
+    # The DEFAULT_MZ_VOLUMES list uses a single named volume
     # `scratch:/scratch` shared across containers; passing per-instance
     # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the
     # locks separate while leaving the other volumes shared. Found via
@@ -128,6 +138,7 @@ def __init__(self) -> None:
     # corrupted the upsert state.
     Clusterd(
         name="clusterd1",
+        workers=4,
         volumes=[
             "mzdata:/mzdata",
             "mydata:/var/lib/mysql-files",
@@ -137,6 +148,7 @@ def __init__(self) -> None:
     ),
     Clusterd(
         name="clusterd2",
+        workers=4,
         volumes=[
             "mzdata:/mzdata",
             "mydata:/var/lib/mysql-files",
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index ce6e664a2c0de..1a8aab5234f51 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -48,14 +48,14 @@ CREATE CLUSTER ${CLUSTER} REPLICAS (
         STORAGE ADDRESSES ['clusterd1:2103'],
         COMPUTECTL ADDRESSES ['clusterd1:2101'],
         COMPUTE ADDRESSES ['clusterd1:2102'],
-        WORKERS 1
+        WORKERS 4
     ),
     replica2 (
         STORAGECTL ADDRESSES ['clusterd2:2100'],
         STORAGE ADDRESSES ['clusterd2:2103'],
         COMPUTECTL ADDRESSES ['clusterd2:2101'],
         COMPUTE ADDRESSES ['clusterd2:2102'],
-        WORKERS 1
+        WORKERS 4
     )
 );
 GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER};

From d56e33aa450cb4001ca2a025b3f68b889e64b3ec Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 14:52:21 -0400
Subject: [PATCH 33/65] test/antithesis: drop
 --binlog_transaction_dependency_tracking; removed in MySQL 8.4 (WRITESET is
 the default)

---
 test/antithesis/config/docker-compose.yaml |  1 -
 test/antithesis/mzcompose.py               | 10 ++++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index b9c383be7f1a5..0a9c072b81aad 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -187,7 +187,6 @@ services:
     - --binlog-row-metadata=full
     - --server-id=1
     - --max-connections=500
-    - --binlog_transaction_dependency_tracking=WRITESET
     healthcheck:
       test:
       - CMD
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index fbcfa2da9fbb5..584f39852dd12 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -82,16 +82,18 @@ def __init__(self) -> None:
     Zookeeper(),
     Kafka(auto_create_topics=True),
     SchemaRegistry(),
-    # MySQL primary — GTID-enabled with WRITESET dependency tracking so the
-    # replica can safely use parallel workers without losing commit order.
+    # MySQL primary — GTID-enabled. WRITESET binlog dependency tracking
+    # is what lets the replica run parallel workers without losing commit
+    # order; in MySQL 8.4+ WRITESET is the default and the explicit knob
+    # was removed (`binlog_transaction_dependency_tracking` is rejected as
+    # unknown from 8.4.0 on, and the antithesis image is `mysql:9.5.0`).
     MySql(
         use_seeded_image=False,
         volumes=[
             "mysqldata_primary:/var/lib/mysql",
             "mydata:/var/lib/mysql-files",
         ],
-        additional_args=create_mysql_server_args(server_id="1", is_master=True)
-        + ["--binlog_transaction_dependency_tracking=WRITESET"],
+        additional_args=create_mysql_server_args(server_id="1", is_master=True),
     ),
     # MySQL replica — multithreaded replication (4 workers, commit-order
     # preserved).  Replication is configured at runtime by

From 445f452d0b64347f8240c48027c739a6a010b856 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Tue, 12 May 2026 18:25:01 -0400
Subject: [PATCH 34/65] test/antithesis: drop --scratch-directory from clusterd
 so upsert RocksDB uses mem_env (production behavior)

---
 .../mzcompose/services/clusterd.py            | 10 ++++--
 test/antithesis/config/docker-compose.yaml    |  8 ++---
 test/antithesis/mzcompose.py                  | 34 ++++++-------------
 3 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py
index e07ca490a5355..bffe3ddc3e470 100644
--- a/misc/python/materialize/mzcompose/services/clusterd.py
+++ b/misc/python/materialize/mzcompose/services/clusterd.py
@@ -28,7 +28,7 @@ def __init__(
         options: list[str] = [],
         restart: str = "no",
         stop_grace_period: str = "120s",
-        scratch_directory: str = "/scratch",
+        scratch_directory: str | None = "/scratch",
         volumes: list[str] = [],
         workers: int = 1,
         process_names: list[str] = [],
@@ -68,7 +68,13 @@ def __init__(
             f"CLUSTERD_STORAGE_TIMELY_CONFIG={storage_timely_config}",
         ]
 
-        options = ["clusterd", f"--scratch-directory={scratch_directory}", *options]
+        # `scratch_directory=None` omits the CLI flag entirely. clusterd
+        # treats this as "no scratch" — RocksDB switches to its in-memory
+        # env (`Env::mem_env()`), matching the production deployment shape
+        # where cluster replicas have no scratch disk attached.
+        options = ["clusterd", *options]
+        if scratch_directory is not None:
+            options.insert(1, f"--scratch-directory={scratch_directory}")
 
         config: ServiceConfig = {}
 
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 0a9c072b81aad..446c9d0a189f6 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -241,7 +241,6 @@ services:
     - --
     command:
     - clusterd
-    - --scratch-directory=/scratch
     ports:
     - 2100
     - 2101
@@ -269,7 +268,7 @@ services:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
     - tmp:/share/tmp
-    - clusterd1_scratch:/scratch
+    - scratch:/scratch
     restart: 'no'
     stop_grace_period: 120s
     platform: linux/amd64
@@ -280,7 +279,6 @@ services:
     - --
     command:
     - clusterd
-    - --scratch-directory=/scratch
     ports:
     - 2100
     - 2101
@@ -308,7 +306,7 @@ services:
     - mzdata:/mzdata
     - mydata:/var/lib/mysql-files
     - tmp:/share/tmp
-    - clusterd2_scratch:/scratch
+    - scratch:/scratch
     restart: 'no'
     stop_grace_period: 120s
     platform: linux/amd64
@@ -548,5 +546,3 @@ volumes:
   scratch: null
   mysqldata_primary: null
   mysqldata_replica: null
-  clusterd1_scratch: null
-  clusterd2_scratch: null
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 584f39852dd12..5f7da9d8f0e97 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -127,36 +127,24 @@ def __init__(self) -> None:
     # lockstep with this value (it's read by the controller, not by
     # clusterd).
     #
-    # Each clusterd MUST have its own /scratch volume — the upsert
-    # operator's RocksDB state lives there and takes an exclusive file
-    # lock per worker (`/scratch/storage/upsert/<id>/<worker>/LOCK`).
-    # The DEFAULT_MZ_VOLUMES list uses a single named volume
-    # `scratch:/scratch` shared across containers; passing per-instance
-    # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the
-    # locks separate while leaving the other volumes shared. Found via
-    # an Antithesis run where clusterd1 deadlocked retrying to open
-    # `/scratch/storage/upsert/u3/0/LOCK` because clusterd2 held it,
-    # which then drove a continuous suspend-and-restart loop that
-    # corrupted the upsert state.
+    # `scratch_directory=None` matches production: cluster replicas in
+    # cloud deployments don't get a scratch disk, so the upsert operator's
+    # RocksDB initializes with `Env::mem_env()` and stores its state
+    # entirely in process memory. Passing a scratch directory would put
+    # us on a code path production never exercises, and would also
+    # require careful per-instance volume plumbing to avoid the two
+    # clusterds racing on the same `/scratch/storage/upsert/<id>/<w>/LOCK`
+    # file (which manifested as continuous Stalled/suspend-and-restart
+    # loops on clusterd1 in an earlier run).
     Clusterd(
         name="clusterd1",
         workers=4,
-        volumes=[
-            "mzdata:/mzdata",
-            "mydata:/var/lib/mysql-files",
-            "tmp:/share/tmp",
-            "clusterd1_scratch:/scratch",
-        ],
+        scratch_directory=None,
     ),
     Clusterd(
         name="clusterd2",
         workers=4,
-        volumes=[
-            "mzdata:/mzdata",
-            "mydata:/var/lib/mysql-files",
-            "tmp:/share/tmp",
-            "clusterd2_scratch:/scratch",
-        ],
+        scratch_directory=None,
     ),
     Materialized(
         external_blob_store=True,

From 492c30a6913a37197dd00aec5e555a4548feee19 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Wed, 13 May 2026 15:59:05 +0800
Subject: [PATCH 35/65] Not what we want, but what we deserve?

---
 .../parallel-workload-no-unexpected-errors.md |  44 +++
 .../scratchbook/property-catalog.md           |  20 +-
 .../test/parallel_driver_parallel_workload.py | 366 ++++++++++++++++++
 3 files changed, 429 insertions(+), 1 deletion(-)
 create mode 100644 test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md
 create mode 100644 test/antithesis/workload/test/parallel_driver_parallel_workload.py

diff --git a/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md b/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md
new file mode 100644
index 0000000000000..c1e0f67dbc2a8
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md
@@ -0,0 +1,44 @@
+# parallel-workload-no-unexpected-errors
+
+## Summary
+Randomized concurrent SQL against a shared pool of catalog objects should not
+produce unexpected query failures, even while Antithesis injects coordinator and
+replica faults.
+
+## Evidence
+
+### Code Paths
+- `src/adapter/src/coord/sequencer/` — concurrent DDL/DML sequencing and catalog transactions
+- `src/catalog/src/durable/` — catalog state persistence and recovery across restarts
+- `src/compute/src/` — materialized-view rendering and execution after concurrent DDL
+
+### How It Works
+The Antithesis workload uses a fixed shared schema and a small pool of tables
+and materialized views. Multiple worker threads repeatedly race `CREATE`,
+`DROP`, `INSERT`, `UPDATE`, `DELETE`, and `SELECT` against those objects. This
+deliberately forces the coordinator through concurrent catalog changes while the
+Antithesis fault injector pauses or restarts components underneath it.
+
+### What Goes Wrong on Violation
+Unexpected SQL failures here usually mean a concurrency bug in catalog
+sequencing, plan invalidation, or recovery. The workload already tolerates the
+expected race outcomes like "object was dropped" or "concurrent catalog
+modification"; what remains should be a real bug or an unclassified new failure
+mode worth triage.
+
+### Workload Verification
+1. Ensure the shared schema exists
+2. Spawn multiple worker threads
+3. Randomly issue DDL/DML/SELECT against a fixed object pool
+4. Count expected race/drop errors separately
+5. Assert that no other SQL error escapes
+
+### SUT-Side Instrumentation Notes
+- Best primary signal is workload-side because the interesting failures are
+  externally visible query errors, not one specific internal assertion site
+- Candidate follow-up: add targeted SUT-side assertions for catalog invalidation
+  and dropped-object dependency paths once a concrete failure mode is found
+
+### Provenance
+Adapted from the existing `test/parallel-workload/mzcompose.py` randomized SQL
+stress test into the Antithesis workload model.
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 2c308cf3e2e2b..d4074c3bf7e2e 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -405,7 +405,25 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. |
 | **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. |
 
-## Category 8: MySQL CDC Source
+## Category 8: Randomized Concurrency Stress
+
+Properties that use intentionally adversarial concurrent SQL workloads to flush
+out catalog, planning, and recovery bugs that are hard to encode as a single
+deterministic correctness scenario.
+
+### parallel-workload-no-unexpected-errors — Randomized Concurrent SQL Only Hits Expected Race Errors
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P2 — broad regression net rather than one product contract, but good at finding real crashes and catalog races |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_parallel_workload.py`. A shared schema plus four tables/four materialized views are stressed by multiple worker threads racing `CREATE`, `DROP`, `INSERT`, `UPDATE`, `DELETE`, and `SELECT`. The driver records `sometimes("parallel workload: randomized concurrent SQL executed successfully", …)` for liveness, `sometimes("parallel workload: DDL actions were exercised", …)` for coverage, `sometimes("parallel workload: expected concurrent-catalog races were observed", …)` to confirm the workload is hitting the intended contention paths, and one `always("parallel workload: no unexpected SQL errors escaped the randomized stress driver", …)` safety assertion for the failure signal itself. This is intentionally a subset port of `test/parallel-workload/mzcompose.py`, scoped to the existing Antithesis topology rather than the full mzcompose service matrix. |
+| **Property** | Under fault injection and concurrent randomized SQL, Materialize may return expected dropped-object / concurrent-catalog errors, but it must not surface *unexpected* query failures. |
+| **Invariant** | `Always`: every SQL exception raised by the randomized workload matches the driver's expected-concurrency ignore list; any uncategorized error is a property failure. |
+| **Antithesis Angle** | Antithesis can pause or restart environmentd/clusterd while several client threads concurrently create/drop objects and query them. The interesting windows are plan invalidation, catalog transaction races, and recovery of half-finished DDL. |
+| **Why It Matters** | This is a broad bug-finder for timing-sensitive failures that do not map cleanly to one narrow user contract but still produce visible query failures or crashes. It complements the more specific properties by covering the "something went wrong under concurrent SQL churn" space. |
+
+## Category 9: MySQL CDC Source
 
 Properties specific to Materialize's MySQL CDC source pipeline, which reads
 from a multithreaded MySQL replica. The topology adds a MySQL primary (GTID +
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
new file mode 100644
index 0000000000000..46ece1c308341
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis-native randomized parallel SQL workload.
+
+This ports the *intent* of `test/parallel-workload/mzcompose.py` into the
+existing Antithesis workload model without trying to ship the whole
+`materialize.parallel_workload` Python stack inside the workload image.
+
+The driver deliberately shares a small fixed pool of objects across all
+invocations and worker threads:
+  - one schema
+  - four tables
+  - four materialized views over those tables
+
+Workers race CREATE/DROP/INSERT/UPDATE/DELETE/SELECT against that pool. The
+property is not result correctness; it is that concurrent randomized SQL under
+fault injection should not surface *unexpected* query errors. Expected catalog
+race/drop errors are counted and ignored, mirroring the philosophy of the
+original parallel workload.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import random
+import sys
+import threading
+import time
+from collections import Counter
+from dataclasses import dataclass, field
+from typing import Any
+
+import helper_random
+import psycopg
+from helper_pg import PGDATABASE, PGHOST, PGPORT, PGUSER, execute_retry
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.parallel_workload")
+
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+SCHEMA = "antithesis_parallel_workload"
+
+TABLE_COUNT = 4
+WORKER_THREADS = 4
+RUNTIME_S = 25.0
+CONNECT_TIMEOUT_S = 5
+MAX_KEY = 31
+MAX_VALUE = 1000
+
+EXPECTED_ERROR_SUBSTRINGS = [
+    "already exists",
+    "does not exist",
+    "unknown catalog item",
+    "unknown schema",
+    "was dropped while executing a statement",
+    "another session modified the catalog while this DDL transaction was open",
+    "object state changed while transaction was in progress",
+    "query could not complete",
+    "cached plan must not change result type",
+    "the transaction's active cluster has been dropped",
+    "concurrent transaction",
+]
+
+
+@dataclass
+class WorkerStats:
+    successes: int = 0
+    reconnects: int = 0
+    ignored_errors: int = 0
+    actions: Counter[str] = field(default_factory=Counter)
+    ignored_by_reason: Counter[str] = field(default_factory=Counter)
+    unexpected: dict[str, Any] | None = None
+
+
+def table_name(idx: int) -> str:
+    return f"{SCHEMA}.t{idx}"
+
+
+def mv_name(idx: int) -> str:
+    return f"{SCHEMA}.mv{idx}"
+
+
+def ensure_shared_objects() -> None:
+    """Create the shared schema and every table in the pool (idempotent)."""
+    execute_retry(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
+    # Seed all TABLE_COUNT tables so the pool matches what execute_action targets.
+    for idx in range(TABLE_COUNT):
+        execute_retry(
+            f"CREATE TABLE IF NOT EXISTS {table_name(idx)} ("
+            "worker TEXT NOT NULL, k BIGINT NOT NULL, v BIGINT NOT NULL"
+            ")"
+        )
+
+
+def connect() -> psycopg.Connection[Any]:
+    return psycopg.connect(
+        host=PGHOST,
+        port=PGPORT,
+        user=PGUSER,
+        dbname=PGDATABASE,
+        connect_timeout=CONNECT_TIMEOUT_S,
+        autocommit=True,
+    )
+
+
+def choose_action(rng: random.Random) -> str:
+    return rng.choices(
+        [
+            "create_table",
+            "drop_table",
+            "insert",
+            "update",
+            "delete",
+            "select_table",
+            "create_mv",
+            "drop_mv",
+            "select_mv",
+        ],
+        weights=[6, 2, 25, 12, 10, 20, 6, 2, 17],
+        k=1,
+    )[0]
+
+
+def execute_action(
+    conn: psycopg.Connection[Any], rng: random.Random, worker_name: str, action: str
+) -> None:
+    idx = rng.randrange(TABLE_COUNT)
+    table = table_name(idx)
+    mv = mv_name(idx)
+
+    with conn.cursor() as cur:
+        if action == "create_table":
+            cur.execute(
+                f"CREATE TABLE IF NOT EXISTS {table} ("
+                "worker TEXT NOT NULL, "
+                "k BIGINT NOT NULL, "
+                "v BIGINT NOT NULL"
+                ")"
+            )
+        elif action == "drop_table":
+            cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE")
+        elif action == "insert":
+            cur.execute(
+                f"INSERT INTO {table} (worker, k, v) VALUES (%s, %s, %s)",
+                (
+                    worker_name,
+                    rng.randint(0, MAX_KEY),
+                    rng.randint(0, MAX_VALUE),
+                ),
+            )
+        elif action == "update":
+            cur.execute(
+                f"UPDATE {table} SET v = v + 1 WHERE k = %s",
+                (rng.randint(0, MAX_KEY),),
+            )
+        elif action == "delete":
+            cur.execute(
+                f"DELETE FROM {table} WHERE k = %s",
+                (rng.randint(0, MAX_KEY),),
+            )
+        elif action == "select_table":
+            cur.execute(
+                f"SELECT count(*)::bigint, min(v)::bigint, max(v)::bigint FROM {table}"
+            )
+            cur.fetchall()
+        elif action == "create_mv":
+            cur.execute(
+                f"CREATE MATERIALIZED VIEW IF NOT EXISTS {mv} "
+                f"IN CLUSTER {CLUSTER} AS "
+                f"SELECT worker, count(*)::bigint AS c, sum(v)::bigint AS s "
+                f"FROM {table} GROUP BY worker"
+            )
+        elif action == "drop_mv":
+            cur.execute(f"DROP MATERIALIZED VIEW IF EXISTS {mv}")
+        elif action == "select_mv":
+            cur.execute(
+                f"SELECT count(*)::bigint, sum(c)::bigint, sum(s)::bigint FROM {mv}"
+            )
+            cur.fetchall()
+        else:
+            raise ValueError(f"unknown action {action}")
+
+
+def expected_error_reason(exc: BaseException) -> str | None:
+    msg = str(exc)
+    for candidate in EXPECTED_ERROR_SUBSTRINGS:
+        if candidate in msg:
+            return candidate
+    return None
+
+
+def is_connection_error(exc: BaseException) -> bool:
+    return isinstance(exc, (psycopg.OperationalError, psycopg.InterfaceError))
+
+
+def run_worker(
+    worker_id: int,
+    seed: int,
+    deadline: float,
+    stop: threading.Event,
+    stats: WorkerStats,
+) -> None:
+    rng = random.Random(seed)
+    worker_name = f"pw{worker_id}"
+    conn: psycopg.Connection[Any] | None = None
+
+    try:
+        while time.monotonic() < deadline and not stop.is_set():
+            if conn is None or conn.closed:
+                try:
+                    conn = connect()
+                except Exception as exc:  # noqa: BLE001
+                    if not is_connection_error(exc):
+                        stats.unexpected = {
+                            "worker": worker_name,
+                            "action": "connect",
+                            "error": str(exc),
+                        }
+                        stop.set()
+                        return
+                    stats.reconnects += 1
+                    time.sleep(rng.uniform(0.05, 0.2))
+                    continue
+
+            action = choose_action(rng)
+            try:
+                execute_action(conn, rng, worker_name, action)
+                stats.successes += 1
+                stats.actions[action] += 1
+            except Exception as exc:  # noqa: BLE001
+                if is_connection_error(exc):
+                    stats.reconnects += 1
+                    try:
+                        conn.close()
+                    except Exception:  # noqa: BLE001
+                        pass
+                    conn = None
+                    continue
+
+                reason = expected_error_reason(exc)
+                if reason is not None:
+                    stats.ignored_errors += 1
+                    stats.ignored_by_reason[reason] += 1
+                    stats.actions[action] += 1
+                    continue
+
+                stats.unexpected = {
+                    "worker": worker_name,
+                    "action": action,
+                    "error": str(exc),
+                }
+                LOG.exception("unexpected parallel workload error")
+                stop.set()
+                return
+
+            time.sleep(rng.uniform(0.005, 0.05))
+    finally:
+        if conn is not None:
+            try:
+                conn.close()
+            except Exception:  # noqa: BLE001
+                pass
+
+
+def main() -> int:
+    ensure_shared_objects()
+
+    stop = threading.Event()
+    deadline = time.monotonic() + RUNTIME_S
+    seeds = [helper_random.random_u64() for _ in range(WORKER_THREADS)]
+    stats = [WorkerStats() for _ in range(WORKER_THREADS)]
+    threads = [
+        threading.Thread(
+            name=f"parallel-workload-{idx}",
+            target=run_worker,
+            args=(idx, seeds[idx], deadline, stop, stats[idx]),
+        )
+        for idx in range(WORKER_THREADS)
+    ]
+
+    LOG.info("parallel workload starting; schema=%s threads=%d", SCHEMA, WORKER_THREADS)
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()
+
+    total_successes = sum(worker.successes for worker in stats)
+    total_reconnects = sum(worker.reconnects for worker in stats)
+    total_ignored = sum(worker.ignored_errors for worker in stats)
+    action_counts = Counter[str]()
+    ignored_by_reason = Counter[str]()
+    unexpected = next((worker.unexpected for worker in stats if worker.unexpected), None)
+    for worker in stats:
+        action_counts.update(worker.actions)
+        ignored_by_reason.update(worker.ignored_by_reason)
+
+    sometimes(
+        total_successes >= WORKER_THREADS * 5,
+        "parallel workload: randomized concurrent SQL executed successfully",
+        {
+            "successes": total_successes,
+            "threads": WORKER_THREADS,
+            "actions": dict(action_counts),
+            "reconnects": total_reconnects,
+        },
+    )
+    sometimes(
+        action_counts["create_table"]
+        + action_counts["drop_table"]
+        + action_counts["create_mv"]
+        + action_counts["drop_mv"]
+        > 0,
+        "parallel workload: DDL actions were exercised",
+        {
+            "create_table": action_counts["create_table"],
+            "drop_table": action_counts["drop_table"],
+            "create_mv": action_counts["create_mv"],
+            "drop_mv": action_counts["drop_mv"],
+        },
+    )
+    sometimes(
+        total_ignored > 0,
+        "parallel workload: expected concurrent-catalog races were observed",
+        {
+            "ignored_errors": total_ignored,
+            "ignored_by_reason": dict(ignored_by_reason),
+        },
+    )
+    always(
+        unexpected is None,
+        "parallel workload: no unexpected SQL errors escaped the randomized stress driver",
+        {
+            "unexpected": unexpected,
+            "successes": total_successes,
+            "ignored_errors": total_ignored,
+            "reconnects": total_reconnects,
+            "actions": dict(action_counts),
+        },
+    )
+
+    LOG.info(
+        "parallel workload done; successes=%d ignored=%d reconnects=%d unexpected=%s",
+        total_successes,
+        total_ignored,
+        total_reconnects,
+        unexpected,
+    )
+    return 1 if unexpected is not None else 0
+
+
+if __name__ == "__main__":
+    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os)
+    sys.exit(main())

From 972732404989cd82bf722b0cd8d513ee8c0ac3dc Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Wed, 13 May 2026 17:26:28 +0800
Subject: [PATCH 36/65] approach #2

---
 ci/test/build.py                              |   4 +-
 misc/python/materialize/mzbuild.py            |  14 +-
 .../test/antithesis/workload/Dockerfile       |  68 ++
 .../test/antithesis/workload/mzbuild.yml      |  39 ++
 test/antithesis/workload/.gitignore           |   3 +
 test/antithesis/workload/Dockerfile           |  30 +-
 test/antithesis/workload/mzbuild.yml          |  27 +
 .../stubs/materialize/mzcompose/__init__.py   |  37 ++
 .../materialize/mzcompose/composition.py      |  31 +
 .../materialize/mzcompose/helpers/__init__.py |   8 +
 .../materialize/mzcompose/helpers/iceberg.py  |  24 +
 .../mzcompose/services/__init__.py            |   8 +
 .../mzcompose/services/materialized.py        |  33 +
 .../materialize/mzcompose/services/minio.py   |  21 +
 .../materialize/mzcompose/services/mysql.py   |  22 +
 .../mzcompose/services/sql_server.py          |  24 +
 .../test/anytime_fault_recovery_exercised.py  |  22 +-
 .../test/anytime_kafka_frontier_monotonic.py  |   3 +-
 ..._kafka_offset_known_not_below_committed.py |   3 +-
 ...nytime_kafka_source_resumes_after_fault.py |   3 +-
 .../test/first_mysql_replica_setup.py         |   3 +-
 .../first_select_upsert_implementation.py     |   3 +-
 test/antithesis/workload/test/helper_mysql.py |  10 +-
 .../workload/test/helper_mysql_source.py      |   9 +-
 test/antithesis/workload/test/helper_pg.py    |  19 +-
 .../parallel_driver_kafka_none_envelope.py    |   3 +-
 ...rallel_driver_mv_reflects_table_updates.py |   3 +-
 .../test/parallel_driver_mysql_cdc.py         |   3 +-
 .../test/parallel_driver_parallel_workload.py | 609 +++++++++---------
 ...rallel_driver_strict_serializable_reads.py |  22 +-
 .../parallel_driver_upsert_latest_value.py    |   3 +-
 ...ton_driver_catalog_recovery_consistency.py |  40 +-
 ...ngleton_driver_upsert_state_rehydration.py |   3 +-
 33 files changed, 781 insertions(+), 373 deletions(-)
 create mode 100644 misc/python/test/antithesis/workload/Dockerfile
 create mode 100644 misc/python/test/antithesis/workload/mzbuild.yml
 create mode 100644 test/antithesis/workload/.gitignore
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/__init__.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/composition.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py
 create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py

diff --git a/ci/test/build.py b/ci/test/build.py
index 95f4227afbaa7..89d9402aab08f 100755
--- a/ci/test/build.py
+++ b/ci/test/build.py
@@ -63,9 +63,7 @@ def main() -> None:
                 repo.images[name] for name in antithesis_images
             )
         else:
-            deps = repo.resolve_dependencies(
-                image for image in repo if image.publish
-            )
+            deps = repo.resolve_dependencies(image for image in repo if image.publish)
         deps.ensure(pre_build=lambda images: upload_debuginfo(repo, images))
         set_build_status("success")
         annotate_buildkite_with_tags(repo.rd.arch, deps)
diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py
index 2200188139219..08ca9bb43c943 100644
--- a/misc/python/materialize/mzbuild.py
+++ b/misc/python/materialize/mzbuild.py
@@ -473,13 +473,21 @@ def __init__(self, rd: RepositoryDetails, path: Path, config: dict[str, Any]):
 
     def run(self, prep: Any) -> None:
         super().run(prep)
+        source = Path(self.source)
         for src in self.inputs():
-            dst = self.path / self.destination / src
+            rel = Path(src).relative_to(source)
+            dst = self.path / self.destination / rel
             dst.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy(self.rd.root / self.source / src, dst)
+            shutil.copy(self.rd.root / src, dst)
 
     def inputs(self) -> set[str]:
-        return set(git.expand_globs(self.rd.root / self.source, self.matching))
+        # Return repo-root-relative paths so that `ResolvedImage.fingerprint`
+        # (which resolves each input as `rd.root / rel_path`) can lstat them.
+        source = Path(self.source)
+        return {
+            str(source / p)
+            for p in git.expand_globs(self.rd.root / self.source, self.matching)
+        }
 
 
 class CargoPreImage(PreImage):
diff --git a/misc/python/test/antithesis/workload/Dockerfile b/misc/python/test/antithesis/workload/Dockerfile
new file mode 100644
index 0000000000000..513a8d75b5a04
--- /dev/null
+++ b/misc/python/test/antithesis/workload/Dockerfile
@@ -0,0 +1,68 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Antithesis workload client for Materialize.
+#
+# Python-based test driver that connects to materialized via pgwire,
+# produces Kafka messages, and emits Antithesis assertions. The
+# parallel-workload driver reuses the real `materialize.parallel_workload`
+# Python package — see mzbuild.yml for the pre-image copy of the slice it
+# needs, and stubs/materialize/mzcompose/ for the docker-compose surface we
+# have to mock out.
+
+FROM python:3.12-slim-bookworm
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    postgresql-client \
+    && rm -rf /var/lib/apt/lists/*
+
+# `confluent-kafka[avro]` pulls fastavro, which `data_ingest.executor`
+# imports at module top via `confluent_kafka.schema_registry.avro`.
+# `pg8000`, `websocket-client`, `requests`, `xxhash`, `zstandard` cover the
+# rest of the module-load-time imports walking from `parallel_workload` →
+# `data_ingest` → `materialize.util`.
+RUN pip install --no-cache-dir \
+    psycopg[binary]==3.2.9 \
+    "confluent-kafka[avro]==2.8.0" \
+    antithesis==0.2.0 \
+    PyMySQL==1.1.1 \
+    pg8000==1.31.2 \
+    websocket-client==1.8.0 \
+    requests==2.32.3 \
+    xxhash==3.5.0 \
+    zstandard==0.23.0
+
+# setup-complete script
+COPY setup-complete.sh /usr/local/bin/setup-complete.sh
+RUN chmod +x /usr/local/bin/setup-complete.sh
+
+# Test template directory — populated by antithesis-workload skill later
+RUN mkdir -p /opt/antithesis/test/v1/materialize
+
+# Catalog directory for Python assertion cataloging
+RUN mkdir -p /opt/antithesis/catalog
+
+# Ship the `materialize.*` Python package needed by the parallel-workload
+# driver. Stubs are copied first so that the real parallel-workload code
+# layered on top can satisfy its top-level `from materialize.mzcompose...`
+# imports against tiny placeholders. `MZ_ROOT` is required by
+# `materialize/__init__.py` at import time — point it at the package root.
+COPY stubs/materialize/ /opt/antithesis-pkg/materialize/
+COPY materialize/ /opt/antithesis-pkg/materialize/
+ENV PYTHONPATH=/opt/antithesis-pkg
+ENV MZ_ROOT=/opt/antithesis-pkg
+
+# Copy test templates and entrypoint
+COPY test/ /opt/antithesis/test/v1/materialize/
+COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh
+RUN chmod +x /usr/local/bin/workload-entrypoint.sh
+RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true
+
+ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"]
diff --git a/misc/python/test/antithesis/workload/mzbuild.yml b/misc/python/test/antithesis/workload/mzbuild.yml
new file mode 100644
index 0000000000000..2d69faddfd065
--- /dev/null
+++ b/misc/python/test/antithesis/workload/mzbuild.yml
@@ -0,0 +1,39 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+name: antithesis-workload
+
+# The parallel-workload driver reuses the real `materialize.parallel_workload`
+# Python package rather than reimplementing it. Copy the slice of
+# `misc/python/materialize/` it needs into the build context so the Dockerfile
+# can bundle it into the image. Everything in `materialize.mzcompose.*` is
+# replaced by tiny stubs (see `stubs/materialize/mzcompose/`) — Antithesis
+# injects faults at the container layer, so the workload container has no
+# docker-compose orchestration to call into.
+pre-image:
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/__init__.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/util.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/sqlsmith.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/parallel_workload
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/data_ingest
diff --git a/test/antithesis/workload/.gitignore b/test/antithesis/workload/.gitignore
new file mode 100644
index 0000000000000..2c028d08d5e96
--- /dev/null
+++ b/test/antithesis/workload/.gitignore
@@ -0,0 +1,3 @@
+# Populated at image-build time by the `pre-image: type: copy` directives in
+# mzbuild.yml — committing them would diverge from the source tree.
+/materialize/
diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile
index 5cca619ed8234..513a8d75b5a04 100644
--- a/test/antithesis/workload/Dockerfile
+++ b/test/antithesis/workload/Dockerfile
@@ -10,7 +10,11 @@
 # Antithesis workload client for Materialize.
 #
 # Python-based test driver that connects to materialized via pgwire,
-# produces Kafka messages, and emits Antithesis assertions.
+# produces Kafka messages, and emits Antithesis assertions. The
+# parallel-workload driver reuses the real `materialize.parallel_workload`
+# Python package — see mzbuild.yml for the pre-image copy of the slice it
+# needs, and stubs/materialize/mzcompose/ for the docker-compose surface we
+# have to mock out.
 
 FROM python:3.12-slim-bookworm
 
@@ -19,11 +23,21 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     postgresql-client \
     && rm -rf /var/lib/apt/lists/*
 
+# `confluent-kafka[avro]` pulls fastavro, which `data_ingest.executor`
+# imports at module top via `confluent_kafka.schema_registry.avro`.
+# `pg8000`, `websocket-client`, `requests`, `xxhash`, `zstandard` cover the
+# rest of the module-load-time imports walking from `parallel_workload` →
+# `data_ingest` → `materialize.util`.
 RUN pip install --no-cache-dir \
     psycopg[binary]==3.2.9 \
-    confluent-kafka==2.8.0 \
+    "confluent-kafka[avro]==2.8.0" \
     antithesis==0.2.0 \
-    PyMySQL==1.1.1
+    PyMySQL==1.1.1 \
+    pg8000==1.31.2 \
+    websocket-client==1.8.0 \
+    requests==2.32.3 \
+    xxhash==3.5.0 \
+    zstandard==0.23.0
 
 # setup-complete script
 COPY setup-complete.sh /usr/local/bin/setup-complete.sh
@@ -35,6 +49,16 @@ RUN mkdir -p /opt/antithesis/test/v1/materialize
 # Catalog directory for Python assertion cataloging
 RUN mkdir -p /opt/antithesis/catalog
 
+# Ship the `materialize.*` Python package needed by the parallel-workload
+# driver. Stubs are copied first so that the real parallel-workload code
+# layered on top can satisfy its top-level `from materialize.mzcompose...`
+# imports against tiny placeholders. `MZ_ROOT` is required by
+# `materialize/__init__.py` at import time — point it at the package root.
+COPY stubs/materialize/ /opt/antithesis-pkg/materialize/
+COPY materialize/ /opt/antithesis-pkg/materialize/
+ENV PYTHONPATH=/opt/antithesis-pkg
+ENV MZ_ROOT=/opt/antithesis-pkg
+
 # Copy test templates and entrypoint
 COPY test/ /opt/antithesis/test/v1/materialize/
 COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh
diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml
index f62b4c073bb00..b957b4f8a2046 100644
--- a/test/antithesis/workload/mzbuild.yml
+++ b/test/antithesis/workload/mzbuild.yml
@@ -8,3 +8,30 @@
 # by the Apache License, Version 2.0.
 
 name: antithesis-workload
+
+# The parallel-workload driver reuses the real `materialize.parallel_workload`
+# Python package rather than reimplementing it. Copy the needed slice of
+# `misc/python/materialize/` into the build context so the Dockerfile can
+# bundle it into the image. The list is intentionally narrow — everything
+# else is mocked out by the stubs in `stubs/materialize/mzcompose/...`.
+pre-image:
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/__init__.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/util.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/sqlsmith.py
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/parallel_workload/**
+  - type: copy
+    source: misc/python
+    destination: .
+    matching: materialize/data_ingest/**
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py
new file mode 100644
index 0000000000000..4896a7d403416
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py
@@ -0,0 +1,37 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose` for the Antithesis workload image.
+
+`materialize.parallel_workload` and `materialize.data_ingest` import
+`materialize.mzcompose` symbols at module load time even on code paths that
+don't actually run a docker-compose harness. The Antithesis workload image is
+a slim Python container with no docker/mzbuild toolchain, so we ship these
+stubs in its `PYTHONPATH` to satisfy the imports. Only attributes the
+parallel-workload driver hits at module top are provided; anything called at
+runtime in this environment would be a bug.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def get_default_system_parameters() -> dict[str, str]:
+    return {}  # stub: always an empty mapping
+
+
+cluster_replica_size_map: dict[str, Any] = {}
+
+
+class _LoaderModule:
+    """Attribute-less placeholder type backing the module-level `loader`."""
+
+
+loader = _LoaderModule()
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/composition.py b/test/antithesis/workload/stubs/materialize/mzcompose/composition.py
new file mode 100644
index 0000000000000..4e0fff97fbbcd
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/composition.py
@@ -0,0 +1,31 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.composition`. See package __init__.py."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class Composition:
+    """Import-time stand-in that keeps `Composition | None` annotations valid.
+
+    `parallel_workload` only calls Composition methods on paths gated by
+    `Scenario.{Kill,BackupRestore,ZeroDowntimeDeploy}`, which the Antithesis
+    driver never selects, so constructing one here is a programming error.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        message = (
+            "materialize.mzcompose.composition.Composition is stubbed in the "
+            "Antithesis workload image; Antithesis injects faults at the "
+            "container layer, so docker-compose orchestration is unavailable."
+        )
+        raise RuntimeError(message)
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py
new file mode 100644
index 0000000000000..caae679255ee1
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py
@@ -0,0 +1,8 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py
new file mode 100644
index 0000000000000..eddc6d93231e5
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py
@@ -0,0 +1,24 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.helpers.iceberg`. See package __init__.py."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def setup_polaris_for_iceberg(c: Any, *args: Any, **kwargs: Any) -> tuple[str, str]:
+    """Always raise; the Antithesis topology does not run Polaris.
+
+    The real `Database.create` calls this unconditionally, so the driver is
+    expected to override `Database.create` and never reach this stub.
+    """
+    prefix = "setup_polaris_for_iceberg() stub: iceberg sinks are not supported "
+    raise RuntimeError(prefix + "inside the Antithesis workload container.")
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py
new file mode 100644
index 0000000000000..caae679255ee1
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py
@@ -0,0 +1,8 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py
new file mode 100644
index 0000000000000..a7d436724ace6
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py
@@ -0,0 +1,33 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.services.materialized`. See package
+__init__.py for context."""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any
+
+LEADER_STATUS_HEALTHCHECK: list[str] = []
+
+
+class DeploymentStatus(Enum):  # NOTE(review): confirm values match the real enum
+    READY_TO_PROMOTE = "ready_to_promote"
+    IS_LEADER = "is_leader"
+
+
+class Materialized:
+    """Raise-on-construct stand-in; only `ZeroDowntimeDeployAction` builds one."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        prefix = "Materialized service stub: zero-downtime-deploy is not "
+        suffix = "supported inside the Antithesis workload container."
+        # Any construction attempt is unsupported in this image; fail loudly.
+        raise RuntimeError(prefix + suffix)
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py
new file mode 100644
index 0000000000000..07ee119c96d48
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py
@@ -0,0 +1,21 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.services.minio`. See package __init__.py."""
+
+from __future__ import annotations
+
+
+def minio_blob_uri() -> str:
+    """Always raise; only BackupRestoreAction (never scheduled here) uses this."""
+    message = (
+        "minio_blob_uri() stub: BackupRestore scenario is not supported "
+        "inside the Antithesis workload container."
+    )
+    raise RuntimeError(message)
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py
new file mode 100644
index 0000000000000..1aeb60be61c16
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py
@@ -0,0 +1,22 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.services.mysql`. See package __init__.py.
+
+Only the `DEFAULT_ROOT_PASSWORD` class attribute is read at runtime — the
+constant must match the real `MySql` service so the parallel-workload's
+`CREATE SECRET mypass AS ...` matches the actual MySQL container password
+provisioned by the Antithesis topology.
+"""
+
+from __future__ import annotations
+
+
+class MySql:
+    DEFAULT_ROOT_PASSWORD = "p@ssw0rd"  # must match the real `MySql` service
diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py
new file mode 100644
index 0000000000000..023cde11b9bbd
--- /dev/null
+++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py
@@ -0,0 +1,24 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Stub of `materialize.mzcompose.services.sql_server`. See package __init__.py.
+
+Constants kept in sync with the real `SqlServer` service so any SQL emitted
+referring to them stays well-formed. The Antithesis topology doesn't actually
+include a sql-server container — driver code avoids
+`CreateSqlServerSourceAction` and overrides the connection setup in
+`Database.create` accordingly.
+"""
+
+from __future__ import annotations
+
+
+class SqlServer:  # constants mirror the real `SqlServer` service definition
+    DEFAULT_USER = "SA"
+    DEFAULT_SA_PASSWORD = "RPSsql12345"
diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
index 143dd8c103dce..ff90867b0b6f5 100755
--- a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
+++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
@@ -45,6 +45,7 @@
 import time
 
 import psycopg
+from antithesis.assertions import sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -53,8 +54,6 @@
     query_one_retry,
 )
 
-from antithesis.assertions import sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -75,14 +74,17 @@ def _probe_select_one() -> bool:
     the recovery transition we are looking for.
     """
     try:
-        with psycopg.connect(
-            host=PGHOST,
-            port=PGPORT,
-            user=PGUSER,
-            dbname=PGDATABASE,
-            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
-            autocommit=True,
-        ) as conn, conn.cursor() as cur:
+        with (
+            psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+                autocommit=True,
+            ) as conn,
+            conn.cursor() as cur,
+        ):
             cur.execute("SELECT 1")
             row = cur.fetchone()
             return row is not None and row[0] == 1
diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
index faee0fd0c680e..efd906a725844 100755
--- a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
+++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
@@ -42,11 +42,10 @@
 import sys
 import time
 
+from antithesis.assertions import always
 from helper_pg import query_retry
 from helper_source_stats import offset_committed
 
-from antithesis.assertions import always
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
index 9801c4dfa65b7..a8d6be62ae6a9 100755
--- a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
+++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
@@ -40,9 +40,8 @@
 import sys
 import time
 
-from helper_pg import query_retry
-
 from antithesis.assertions import always
+from helper_pg import query_retry
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
index 85042a317d7cb..b453f62631aac 100755
--- a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
+++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
@@ -50,9 +50,8 @@
 import sys
 import time
 
-from helper_pg import query_one_retry, query_retry
-
 from antithesis.assertions import reachable, sometimes
+from helper_pg import query_one_retry, query_retry
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py
index 4380b5f4bd40d..ee603e60e88d6 100644
--- a/test/antithesis/workload/test/first_mysql_replica_setup.py
+++ b/test/antithesis/workload/test/first_mysql_replica_setup.py
@@ -30,9 +30,8 @@
 import time
 
 import helper_mysql
-from helper_mysql_source import ensure_mysql_cdc_source
-
 from antithesis.assertions import reachable, sometimes
+from helper_mysql_source import ensure_mysql_cdc_source
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py
index 03394a1ebd7f7..584f40da7812c 100755
--- a/test/antithesis/workload/test/first_select_upsert_implementation.py
+++ b/test/antithesis/workload/test/first_select_upsert_implementation.py
@@ -29,9 +29,8 @@
 import sys
 
 import helper_random
-from helper_pg import execute_internal_retry
-
 from antithesis.assertions import sometimes
+from helper_pg import execute_internal_retry
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py
index e99b3656cb4dd..f9b79395c556a 100644
--- a/test/antithesis/workload/test/helper_mysql.py
+++ b/test/antithesis/workload/test/helper_mysql.py
@@ -36,7 +36,7 @@
 
 
 def _retryable(exc: BaseException) -> bool:
-    return isinstance(exc, (pymysql.OperationalError, pymysql.InterfaceError))
+    return isinstance(exc, pymysql.OperationalError | pymysql.InterfaceError)
 
 
 def _open(host: str, database: str) -> pymysql.connections.Connection:
@@ -116,16 +116,12 @@ def execute_replica(sql: str, params: tuple = (), database: str = "mysql") -> No
     _execute(MYSQL_REPLICA_HOST, sql, params, database)
 
 
-def query_primary(
-    sql: str, params: tuple = (), database: str = "mysql"
-) -> list[tuple]:
+def query_primary(sql: str, params: tuple = (), database: str = "mysql") -> list[tuple]:
     """Run a query on the MySQL primary and return all rows."""
     return _query(MYSQL_HOST, sql, params, database)
 
 
-def query_replica(
-    sql: str, params: tuple = (), database: str = "mysql"
-) -> list[tuple]:
+def query_replica(sql: str, params: tuple = (), database: str = "mysql") -> list[tuple]:
     """Run a query on the MySQL replica and return all rows."""
     return _query(MYSQL_REPLICA_HOST, sql, params, database)
 
diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py
index 6572eddc9c7e4..34323a846faed 100644
--- a/test/antithesis/workload/test/helper_mysql_source.py
+++ b/test/antithesis/workload/test/helper_mysql_source.py
@@ -28,7 +28,6 @@
 import os
 
 import psycopg
-
 from helper_pg import create_source_idempotent, execute_retry, query_retry
 
 LOG = logging.getLogger("antithesis.helper_mysql_source")
@@ -48,9 +47,7 @@
 
 def ensure_mysql_connection() -> None:
     """Create the MySQL secret and connection in Materialize (idempotent)."""
-    execute_retry(
-        f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'"
-    )
+    execute_retry(f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'")
     execute_retry(
         f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO MYSQL ("
         f"HOST '{MYSQL_REPLICA_HOST}', "
@@ -58,7 +55,9 @@ def ensure_mysql_connection() -> None:
         f"PASSWORD SECRET {SECRET_NAME}"
         f")"
     )
-    LOG.info("mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST)
+    LOG.info(
+        "mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST
+    )
 
 
 def ensure_mysql_cdc_table() -> None:
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index 5c74276fe5f90..e3508c4f44b4a 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -142,14 +142,17 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non
     backoff = _RETRY_INITIAL_S
     while True:
         try:
-            with psycopg.connect(
-                host=PGHOST,
-                port=PGPORT_INTERNAL,
-                user=PGUSER_INTERNAL,
-                dbname=PGDATABASE,
-                connect_timeout=_CONNECT_TIMEOUT_S,
-                autocommit=True,
-            ) as conn, conn.cursor() as cur:
+            with (
+                psycopg.connect(
+                    host=PGHOST,
+                    port=PGPORT_INTERNAL,
+                    user=PGUSER_INTERNAL,
+                    dbname=PGDATABASE,
+                    connect_timeout=_CONNECT_TIMEOUT_S,
+                    autocommit=True,
+                ) as conn,
+                conn.cursor() as cur,
+            ):
                 cur.execute(sql, params or ())
             return
         except Exception as exc:  # noqa: BLE001
diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
index 9c3c0e2461cbe..3c1a4e1359793 100755
--- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -45,6 +45,7 @@
 import sys
 
 import helper_random
+from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_none_source import (
     SOURCE_NONE_TEXT,
@@ -55,8 +56,6 @@
 from helper_quiet import request_quiet_period
 from helper_source_stats import wait_for_catchup
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
index c026be09ea522..876f5ff5a8e5e 100755
--- a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
+++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
@@ -45,12 +45,11 @@
 import time
 
 import helper_random
+from antithesis.assertions import always, sometimes
 from helper_pg import execute_retry, query_one_retry
 from helper_quiet import request_quiet_period
 from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
index 67a9627e1e386..c51330251bad8 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
@@ -39,12 +39,11 @@
 
 import helper_mysql
 import helper_random
+from antithesis.assertions import always, sometimes
 from helper_mysql_source import SOURCE_NAME, TABLE_NAME
 from helper_pg import query_retry
 from helper_quiet import request_quiet_period
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 46ece1c308341..1c8dadf8f641d 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -8,23 +8,32 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 
-"""Antithesis-native randomized parallel SQL workload.
-
-This ports the *intent* of `test/parallel-workload/mzcompose.py` into the
-existing Antithesis workload model without trying to ship the whole
-`materialize.parallel_workload` Python stack inside the workload image.
-
-The driver deliberately shares a small fixed pool of objects across all
-invocations and worker threads:
-  - one schema
-  - four tables
-  - four materialized views over those tables
-
-Workers race CREATE/DROP/INSERT/UPDATE/DELETE/SELECT against that pool. The
-property is not result correctness; it is that concurrent randomized SQL under
-fault injection should not surface *unexpected* query errors. Expected catalog
-race/drop errors are counted and ignored, mirroring the philosophy of the
-original parallel workload.
+"""Antithesis driver wrapping the real `materialize.parallel_workload`.
+
+Earlier versions of this file reimplemented the *idea* of parallel-workload
+(a fixed pool of objects, worker threads racing CREATE/DROP/INSERT/etc.).
+That diverged from the canonical stress driver and forced us to rederive the
+catalog-race error catalog by hand. This module instead bundles the real
+`materialize.parallel_workload` package into the workload image (see
+`mzbuild.yml` + `Dockerfile`) and invokes its `Worker`, `Action`,
+`ActionList`, and `Database` classes directly.
+
+A few pieces of upstream's `parallel_workload.run()` orchestration don't
+translate to the Antithesis topology:
+
+  * Faults are injected at the container layer by Antithesis itself, so we
+    don't spawn `KillAction`/`BackupRestoreAction`/`ZeroDowntimeDeployAction`
+    worker threads. We still tag the database with `Scenario.Kill` so each
+    `Action.errors_to_ignore` includes connection-shaped errors — those are
+    expected here.
+  * `Database.create` unconditionally calls `setup_polaris_for_iceberg(...)`
+    and creates `postgres_conn` / `sql_server_conn` against services that
+    aren't in the Antithesis compose. We override `create` to skip that
+    setup and only wire up the kafka + minio connections the topology
+    actually has.
+  * `parallel_workload.run()` tunes a long list of `ALTER SYSTEM SET` knobs
+    and recreates the `quickstart` cluster. We skip the recreate (it would
+    fight with `antithesis_cluster`) and apply the subset in `_prepare_system`.
 """
 
 from __future__ import annotations
@@ -35,332 +44,350 @@
 import sys
 import threading
 import time
-from collections import Counter
-from dataclasses import dataclass, field
 from typing import Any
 
 import helper_random
 import psycopg
-from helper_pg import PGDATABASE, PGHOST, PGPORT, PGUSER, execute_retry
-
 from antithesis.assertions import always, sometimes
+from helper_pg import (
+    PGDATABASE,
+    PGHOST,
+    PGPORT,
+    PGPORT_INTERNAL,
+    PGUSER,
+    PGUSER_INTERNAL,
+)
+
+from materialize.parallel_workload.action import (
+    ddl_action_list,
+    dml_nontrans_action_list,
+    fetch_action_list,
+    read_action_list,
+    write_action_list,
+)
+from materialize.parallel_workload.database import (
+    MAX_CLUSTER_REPLICAS,
+    MAX_CLUSTERS,
+    MAX_KAFKA_SINKS,
+    MAX_KAFKA_SOURCES,
+    MAX_POSTGRES_SOURCES,
+    MAX_ROLES,
+    MAX_SCHEMAS,
+    MAX_TABLES,
+    MAX_VIEWS,
+    MAX_WEBHOOK_SOURCES,
+    Database,
+)
+from materialize.parallel_workload.executor import Executor
+from materialize.parallel_workload.settings import Complexity, Scenario
+from materialize.parallel_workload.worker import Worker
+from materialize.parallel_workload.worker_exception import WorkerFailedException
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
 LOG = logging.getLogger("driver.parallel_workload")
 
-CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
-SCHEMA = "antithesis_parallel_workload"
-
-TABLE_COUNT = 4
-WORKER_THREADS = 4
-RUNTIME_S = 25.0
-CONNECT_TIMEOUT_S = 5
-MAX_KEY = 31
-MAX_VALUE = 1000
-
-EXPECTED_ERROR_SUBSTRINGS = [
-    "already exists",
-    "does not exist",
-    "unknown catalog item",
-    "unknown schema",
-    "was dropped while executing a statement",
-    "another session modified the catalog while this DDL transaction was open",
-    "object state changed while transaction was in progress",
-    "query could not complete",
-    "cached plan must not change result type",
-    "the transaction's active cluster has been dropped",
-    "concurrent transaction",
-]
-
-
-@dataclass
-class WorkerStats:
-    successes: int = 0
-    reconnects: int = 0
-    ignored_errors: int = 0
-    actions: Counter[str] = field(default_factory=Counter)
-    ignored_by_reason: Counter[str] = field(default_factory=Counter)
-    unexpected: dict[str, Any] | None = None
-
-
-def table_name(idx: int) -> str:
-    return f"{SCHEMA}.t{idx}"
-
-
-def mv_name(idx: int) -> str:
-    return f"{SCHEMA}.mv{idx}"
-
-
-def ensure_shared_objects() -> None:
-    execute_retry(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
-    for idx in range(2):
-        execute_retry(
-            f"CREATE TABLE IF NOT EXISTS {table_name(idx)} ("
-            "worker TEXT NOT NULL, "
-            "k BIGINT NOT NULL, "
-            "v BIGINT NOT NULL"
-            ")"
+# Antithesis Test Composer invokes drivers in tight loops, so each run of this
+# script is intentionally short-lived. The cap exists so a single iteration
+# can't monopolise the fault-injection budget; the goal is repeated short bursts.
+RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20"))
+NUM_THREADS = int(os.environ.get("PW_THREADS", "4"))
+
+
+def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None:
+    try:
+        cur.execute(stmt.encode())
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("ALTER SYSTEM tolerated: %s (%s)", stmt, exc)
+
+
+def _prepare_system(num_threads: int) -> None:
+    """Apply the catalog-size knobs from `parallel_workload.run()` so the
+    workload doesn't trip default limits. The privilege grants mirror upstream
+    so most queries don't fail on permissions. Idempotent across drivers."""
+    with (
+        psycopg.connect(
+            host=PGHOST,
+            port=PGPORT_INTERNAL,
+            user=PGUSER_INTERNAL,
+            dbname=PGDATABASE,
+            autocommit=True,
+            connect_timeout=15,
+        ) as conn,
+        conn.cursor() as cur,
+    ):
+        _alter_system(
+            cur,
+            f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 40 + num_threads}",
+        )
+        _alter_system(
+            cur, f"ALTER SYSTEM SET max_tables = {MAX_TABLES * 40 + num_threads}"
+        )
+        _alter_system(
+            cur,
+            f"ALTER SYSTEM SET max_materialized_views = {MAX_VIEWS * 40 + num_threads}",
+        )
+        _alter_system(
+            cur,
+            f"ALTER SYSTEM SET max_sources = "
+            f"{(MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 40 + num_threads}",
+        )
+        _alter_system(
+            cur, f"ALTER SYSTEM SET max_sinks = {MAX_KAFKA_SINKS * 40 + num_threads}"
+        )
+        _alter_system(
+            cur, f"ALTER SYSTEM SET max_roles = {MAX_ROLES * 1000 + num_threads}"
+        )
+        _alter_system(
+            cur, f"ALTER SYSTEM SET max_clusters = {MAX_CLUSTERS * 40 + num_threads}"
         )
+        _alter_system(
+            cur,
+            f"ALTER SYSTEM SET max_replicas_per_cluster = "
+            f"{MAX_CLUSTER_REPLICAS * 40 + num_threads}",
+        )
+        _alter_system(cur, "ALTER SYSTEM SET max_secrets = 1000000")
+        _alter_system(cur, "ALTER SYSTEM SET idle_in_transaction_session_timeout = 0")
+        for object_type in (
+            "TABLES",
+            "TYPES",
+            "SECRETS",
+            "CONNECTIONS",
+            "DATABASES",
+            "SCHEMAS",
+            "CLUSTERS",
+        ):
+            _alter_system(
+                cur,
+                f"ALTER DEFAULT PRIVILEGES FOR ALL ROLES "
+                f"GRANT ALL PRIVILEGES ON {object_type} TO PUBLIC",
+            )
 
 
-def connect() -> psycopg.Connection[Any]:
-    return psycopg.connect(
-        host=PGHOST,
-        port=PGPORT,
-        user=PGUSER,
-        dbname=PGDATABASE,
-        connect_timeout=CONNECT_TIMEOUT_S,
-        autocommit=True,
-    )
+def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
+    """Stand-in for `Database.create` that only sets up connections matching
+    the Antithesis topology. Upstream's `create()` also wires polaris,
+    sql-server, and an external postgres source — none of those are running
+    in this compose."""
+    from pg8000.native import identifier
 
+    for db in database.dbs:
+        db.drop(exe)
+        db.create(exe)
 
-def choose_action(rng: random.Random) -> str:
-    return rng.choices(
-        [
-            "create_table",
-            "drop_table",
-            "insert",
-            "update",
-            "delete",
-            "select_table",
-            "create_mv",
-            "drop_mv",
-            "select_mv",
-        ],
-        weights=[6, 2, 25, 12, 10, 20, 6, 2, 17],
-        k=1,
-    )[0]
-
-
-def execute_action(
-    conn: psycopg.Connection[Any], rng: random.Random, worker_name: str, action: str
-) -> None:
-    idx = rng.randrange(TABLE_COUNT)
-    table = table_name(idx)
-    mv = mv_name(idx)
-
-    with conn.cursor() as cur:
-        if action == "create_table":
-            cur.execute(
-                f"CREATE TABLE IF NOT EXISTS {table} ("
-                "worker TEXT NOT NULL, "
-                "k BIGINT NOT NULL, "
-                "v BIGINT NOT NULL"
-                ")"
-            )
-        elif action == "drop_table":
-            cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE")
-        elif action == "insert":
-            cur.execute(
-                f"INSERT INTO {table} (worker, k, v) VALUES (%s, %s, %s)",
-                (
-                    worker_name,
-                    rng.randint(0, MAX_KEY),
-                    rng.randint(0, MAX_VALUE),
-                ),
-            )
-        elif action == "update":
-            cur.execute(
-                f"UPDATE {table} SET v = v + 1 WHERE k = %s",
-                (rng.randint(0, MAX_KEY),),
-            )
-        elif action == "delete":
-            cur.execute(
-                f"DELETE FROM {table} WHERE k = %s",
-                (rng.randint(0, MAX_KEY),),
-            )
-        elif action == "select_table":
-            cur.execute(
-                f"SELECT count(*)::bigint, min(v)::bigint, max(v)::bigint FROM {table}"
-            )
-            cur.fetchall()
-        elif action == "create_mv":
-            cur.execute(
-                f"CREATE MATERIALIZED VIEW IF NOT EXISTS {mv} "
-                f"IN CLUSTER {CLUSTER} AS "
-                f"SELECT worker, count(*)::bigint AS c, sum(v)::bigint AS s "
-                f"FROM {table} GROUP BY worker"
-            )
-        elif action == "drop_mv":
-            cur.execute(f"DROP MATERIALIZED VIEW IF EXISTS {mv}")
-        elif action == "select_mv":
-            cur.execute(
-                f"SELECT count(*)::bigint, sum(c)::bigint, sum(s)::bigint FROM {mv}"
-            )
-            cur.fetchall()
-        else:
-            raise ValueError(f"unknown action {action}")
+    exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'")
+    for row in exe.cur.fetchall():
+        exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE")
 
+    exe.execute("DROP SECRET IF EXISTS minio CASCADE")
+    exe.execute("DROP CONNECTION IF EXISTS aws_conn CASCADE")
+    exe.execute("DROP CONNECTION IF EXISTS kafka_conn CASCADE")
+    exe.execute("DROP CONNECTION IF EXISTS csr_conn CASCADE")
 
-def expected_error_reason(exc: BaseException) -> str | None:
-    msg = str(exc)
-    for candidate in EXPECTED_ERROR_SUBSTRINGS:
-        if candidate in msg:
-            return candidate
-    return None
+    exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'")
+    for row in exe.cur.fetchall():
+        exe.execute(f"DROP ROLE {identifier(row[0])}")
 
+    exe.execute(
+        "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA "
+        "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT"
+    )
+    exe.execute(
+        "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA "
+        "REGISTRY URL 'http://schema-registry:8081'"
+    )
+    exe.execute("CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'")
+    exe.execute(
+        "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS ("
+        "ENDPOINT 'http://minio:9000/', REGION 'minio', "
+        "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET minio)"
+    )
 
-def is_connection_error(exc: BaseException) -> bool:
-    return isinstance(exc, (psycopg.OperationalError, psycopg.InterfaceError))
+    for relation in database:
+        relation.create(exe)
+
+
+def _spawn_workers(
+    rng: random.Random,
+    database: Database,
+    end_time: float,
+    num_threads: int,
+) -> tuple[list[Worker], list[threading.Thread]]:
+    """Build the same thread pool `parallel_workload.run()` does for
+    `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper."""
+    weights = [60, 30, 30, 30, 100]
+    workers: list[Worker] = []
+    threads: list[threading.Thread] = []
+    for i in range(num_threads):
+        worker_rng = random.Random(rng.randrange(1_000_000))
+        action_list = worker_rng.choices(
+            [
+                read_action_list,
+                fetch_action_list,
+                write_action_list,
+                dml_nontrans_action_list,
+                ddl_action_list,
+            ],
+            weights,
+        )[0]
+        actions = [
+            action_class(worker_rng, None)
+            for action_class in action_list.action_classes
+        ]
+        worker = Worker(
+            worker_rng,
+            actions,
+            action_list.weights,
+            end_time,
+            action_list.autocommit,
+            system=False,
+            composition=None,
+            action_list=action_list,
+        )
+        workers.append(worker)
+        thread = threading.Thread(
+            name=f"pw-worker-{i}",
+            target=worker.run,
+            args=(PGHOST, PGPORT, 6876, PGUSER, database),
+        )
+        thread.start()
+        threads.append(thread)
+    return workers, threads
 
 
-def run_worker(
-    worker_id: int,
-    seed: int,
-    deadline: float,
-    stop: threading.Event,
-    stats: WorkerStats,
-) -> None:
+def main() -> int:
+    seed = str(helper_random.random_u64())
     rng = random.Random(seed)
-    worker_name = f"pw{worker_id}"
-    conn: psycopg.Connection[Any] | None = None
 
-    try:
-        while time.monotonic() < deadline and not stop.is_set():
-            if conn is None or conn.closed:
-                try:
-                    conn = connect()
-                except Exception as exc:  # noqa: BLE001
-                    if not is_connection_error(exc):
-                        stats.unexpected = {
-                            "worker": worker_name,
-                            "action": "connect",
-                            "error": str(exc),
-                        }
-                        stop.set()
-                        return
-                    stats.reconnects += 1
-                    time.sleep(rng.uniform(0.05, 0.2))
-                    continue
-
-            action = choose_action(rng)
-            try:
-                execute_action(conn, rng, worker_name, action)
-                stats.successes += 1
-                stats.actions[action] += 1
-            except Exception as exc:  # noqa: BLE001
-                if is_connection_error(exc):
-                    stats.reconnects += 1
-                    try:
-                        conn.close()
-                    except Exception:  # noqa: BLE001
-                        pass
-                    conn = None
-                    continue
-
-                reason = expected_error_reason(exc)
-                if reason is not None:
-                    stats.ignored_errors += 1
-                    stats.ignored_by_reason[reason] += 1
-                    stats.actions[action] += 1
-                    continue
-
-                stats.unexpected = {
-                    "worker": worker_name,
-                    "action": action,
-                    "error": str(exc),
-                }
-                LOG.exception("unexpected parallel workload error")
-                stop.set()
-                return
-
-            time.sleep(rng.uniform(0.005, 0.05))
-    finally:
-        if conn is not None:
-            try:
-                conn.close()
-            except Exception:  # noqa: BLE001
-                pass
+    LOG.info(
+        "parallel-workload starting: seed=%s threads=%d runtime=%ss",
+        seed,
+        NUM_THREADS,
+        RUNTIME_S,
+    )
 
+    _prepare_system(NUM_THREADS)
 
-def main() -> int:
-    ensure_shared_objects()
-
-    stop = threading.Event()
-    deadline = time.monotonic() + RUNTIME_S
-    seeds = [helper_random.random_u64() for _ in range(WORKER_THREADS)]
-    stats = [WorkerStats() for _ in range(WORKER_THREADS)]
-    threads = [
-        threading.Thread(
-            name=f"parallel-workload-{idx}",
-            target=run_worker,
-            args=(idx, seeds[idx], deadline, stop, stats[idx]),
-        )
-        for idx in range(WORKER_THREADS)
-    ]
+    # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection
+    # drops, which mirrors what Antithesis container-pauses look like at the
+    # client. We never instantiate `KillAction` itself.
+    database = Database(
+        rng=rng,
+        seed=seed,
+        host=PGHOST,
+        ports={
+            "materialized": PGPORT,
+            "mz_system": PGPORT_INTERNAL,
+            "http": 6876,
+            "kafka": 9092,
+            "schema-registry": 8081,
+        },
+        complexity=Complexity.DDL,
+        scenario=Scenario.Kill,
+        naughty_identifiers=False,
+    )
 
-    LOG.info("parallel workload starting; schema=%s threads=%d", SCHEMA, WORKER_THREADS)
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-
-    total_successes = sum(worker.successes for worker in stats)
-    total_reconnects = sum(worker.reconnects for worker in stats)
-    total_ignored = sum(worker.ignored_errors for worker in stats)
-    action_counts = Counter[str]()
-    ignored_by_reason = Counter[str]()
-    unexpected = next((worker.unexpected for worker in stats if worker.unexpected), None)
-    for worker in stats:
-        action_counts.update(worker.actions)
-        ignored_by_reason.update(worker.ignored_by_reason)
+    end_time = time.time() + RUNTIME_S
 
-    sometimes(
-        total_successes >= WORKER_THREADS * 5,
-        "parallel workload: randomized concurrent SQL executed successfully",
-        {
-            "successes": total_successes,
-            "threads": WORKER_THREADS,
-            "actions": dict(action_counts),
-            "reconnects": total_reconnects,
-        },
+    setup_failure: Exception | None = None
+    try:
+        with (
+            psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                autocommit=True,
+                connect_timeout=15,
+            ) as setup_conn,
+            setup_conn.cursor() as setup_cur,
+        ):
+            setup_exe = Executor(rng, setup_cur, None, database)
+            _create_database_for_antithesis(database, setup_exe)
+    except Exception as exc:  # noqa: BLE001
+        setup_failure = exc
+        LOG.exception("parallel-workload setup failed")
+
+    workers: list[Worker] = []
+    threads: list[threading.Thread] = []
+    worker_failed: WorkerFailedException | None = None
+    if setup_failure is None:
+        workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS)
+        try:
+            while time.time() < end_time:
+                dead = [t for t in threads if not t.is_alive()]
+                if dead:
+                    occurred = next(
+                        (w.occurred_exception for w in workers if w.occurred_exception),
+                        None,
+                    )
+                    worker_failed = WorkerFailedException(
+                        f"thread {dead[0].name} exited early", occurred
+                    )
+                    for worker in workers:
+                        worker.end_time = time.time()
+                    break
+                time.sleep(0.5)
+        finally:
+            for worker in workers:
+                worker.end_time = time.time()
+            for thread in threads:
+                thread.join(timeout=30)
+
+    total_queries = sum(w.num_queries.total() for w in workers)
+    total_ignored = sum(
+        count
+        for w in workers
+        for counter in w.ignored_errors.values()
+        for count in counter.values()
     )
+
     sometimes(
-        action_counts["create_table"]
-        + action_counts["drop_table"]
-        + action_counts["create_mv"]
-        + action_counts["drop_mv"]
-        > 0,
-        "parallel workload: DDL actions were exercised",
+        total_queries >= NUM_THREADS,
+        "parallel workload: randomized concurrent SQL executed successfully",
         {
-            "create_table": action_counts["create_table"],
-            "drop_table": action_counts["drop_table"],
-            "create_mv": action_counts["create_mv"],
-            "drop_mv": action_counts["drop_mv"],
+            "queries": total_queries,
+            "threads": NUM_THREADS,
+            "ignored_errors": total_ignored,
         },
     )
     sometimes(
         total_ignored > 0,
         "parallel workload: expected concurrent-catalog races were observed",
-        {
-            "ignored_errors": total_ignored,
-            "ignored_by_reason": dict(ignored_by_reason),
-        },
+        {"ignored_errors": total_ignored},
     )
+
+    unexpected = None
+    if setup_failure is not None:
+        unexpected = {"phase": "setup", "error": str(setup_failure)}
+    elif worker_failed is not None:
+        unexpected = {
+            "phase": "worker",
+            "error": (
+                str(worker_failed.cause) if worker_failed.cause else str(worker_failed)
+            ),
+        }
+
     always(
         unexpected is None,
         "parallel workload: no unexpected SQL errors escaped the randomized stress driver",
         {
             "unexpected": unexpected,
-            "successes": total_successes,
+            "queries": total_queries,
             "ignored_errors": total_ignored,
-            "reconnects": total_reconnects,
-            "actions": dict(action_counts),
+            "threads": NUM_THREADS,
         },
     )
 
     LOG.info(
-        "parallel workload done; successes=%d ignored=%d reconnects=%d unexpected=%s",
-        total_successes,
+        "parallel-workload done: queries=%d ignored=%d unexpected=%s",
+        total_queries,
         total_ignored,
-        total_reconnects,
         unexpected,
     )
     return 1 if unexpected is not None else 0
 
 
 if __name__ == "__main__":
-    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os)
     sys.exit(main())
diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
index c4af73b434635..19e7d1d698dbc 100755
--- a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
+++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
@@ -57,6 +57,7 @@
 
 import helper_random
 import psycopg
+from antithesis.assertions import always, sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -67,8 +68,6 @@
 from helper_quiet import request_quiet_period
 from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -91,14 +90,17 @@ def _fresh_select_count(prefix: str) -> int | None:
     but defends against future changes to the system default.
     """
     try:
-        with psycopg.connect(
-            host=PGHOST,
-            port=PGPORT,
-            user=PGUSER,
-            dbname=PGDATABASE,
-            connect_timeout=PROBE_CONNECT_TIMEOUT_S,
-            autocommit=True,
-        ) as conn, conn.cursor() as cur:
+        with (
+            psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                connect_timeout=PROBE_CONNECT_TIMEOUT_S,
+                autocommit=True,
+            ) as conn,
+            conn.cursor() as cur,
+        ):
             cur.execute("SET transaction_isolation TO 'strict serializable'")
             cur.execute(
                 f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s",
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index 066620aaf6ded..b58c15adcfa34 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -37,6 +37,7 @@
 import sys
 
 import helper_random
+from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_pg import query_one_retry
 from helper_quiet import request_quiet_period
@@ -47,8 +48,6 @@
     ensure_upsert_text_source,
 )
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
index 59385a59a7ac7..53e791185b4ab 100755
--- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
+++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
@@ -53,6 +53,7 @@
 
 import helper_random
 import psycopg
+from antithesis.assertions import always, sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -62,8 +63,6 @@
     query_retry,
 )
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -88,14 +87,17 @@ def _fresh_observed_tables(name_prefix: str) -> set[str] | None:
     rather than blaming the property for a fault-window read.
     """
     try:
-        with psycopg.connect(
-            host=PGHOST,
-            port=PGPORT,
-            user=PGUSER,
-            dbname=PGDATABASE,
-            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
-            autocommit=True,
-        ) as conn, conn.cursor() as cur:
+        with (
+            psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+                autocommit=True,
+            ) as conn,
+            conn.cursor() as cur,
+        ):
             cur.execute(
                 "SELECT name FROM mz_tables WHERE name LIKE %s",
                 (f"{name_prefix}%",),
@@ -155,7 +157,12 @@ def _run_cycle(
         try:
             execute_retry(f"DROP TABLE {table}")
         except Exception as exc:  # noqa: BLE001
-            LOG.info("cycle %d: DROP %s failed (%s); not updating model", cycle_idx, table, exc)
+            LOG.info(
+                "cycle %d: DROP %s failed (%s); not updating model",
+                cycle_idx,
+                table,
+                exc,
+            )
             return False, new_id
         expected.discard(table)
     else:
@@ -163,7 +170,12 @@ def _run_cycle(
         try:
             execute_retry(f"CREATE TABLE {table} (id BIGINT NOT NULL)")
         except Exception as exc:  # noqa: BLE001
-            LOG.info("cycle %d: CREATE %s failed (%s); not updating model", cycle_idx, table, exc)
+            LOG.info(
+                "cycle %d: CREATE %s failed (%s); not updating model",
+                cycle_idx,
+                table,
+                exc,
+            )
             return False, new_id
         expected.add(table)
         new_id += 1
@@ -172,7 +184,9 @@ def _run_cycle(
     # assertion — a fault-window read is not regression evidence.
     observed = _fresh_observed_tables(name_prefix)
     if observed is None:
-        LOG.info("cycle %d: fresh-connection read failed; skipping assertion", cycle_idx)
+        LOG.info(
+            "cycle %d: fresh-connection read failed; skipping assertion", cycle_idx
+        )
         return False, new_id
 
     always(
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 5f3c13bcdce57..d7ccedb9e1a3b 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -58,6 +58,7 @@
 import time
 
 import helper_random
+from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_pg import query_one_retry
 from helper_quiet import request_quiet_period
@@ -68,8 +69,6 @@
     ensure_upsert_text_source,
 )
 
-from antithesis.assertions import always, sometimes
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )

From d72fc00f59b41ac01b79a4721d0fcf12cd384810 Mon Sep 17 00:00:00 2001
From: Dennis Felsing <dennis@felsing.org>
Date: Wed, 13 May 2026 23:56:27 +0800
Subject: [PATCH 37/65] try to fix logging

---
 .../workload/test/parallel_driver_parallel_workload.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 1c8dadf8f641d..d5352500b9440 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -58,6 +58,7 @@
     PGUSER_INTERNAL,
 )
 
+from materialize.parallel_workload import executor as _pw_executor
 from materialize.parallel_workload.action import (
     ddl_action_list,
     dml_nontrans_action_list,
@@ -83,6 +84,15 @@
 from materialize.parallel_workload.worker import Worker
 from materialize.parallel_workload.worker_exception import WorkerFailedException
 
+# `parallel_workload.executor` declares module-level `logging: TextIO | None`
+# and `lock: threading.Lock` as PEP-526 annotations only; they are bound by
+# `initialize_logging()`. `Executor.log()` does `if not logging: return`,
+# which raises `NameError` before that initialiser runs. We don't want the
+# per-query log file (drivers run many times under Antithesis); bind both
+# names to no-op values so `log()` returns immediately.
+_pw_executor.logging = None
+_pw_executor.lock = threading.Lock()
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )

From bd5fbc4e02fa20d0825712b58a044d5b24d830f9 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 17:51:08 -0400
Subject: [PATCH 38/65] =?UTF-8?q?test/antithesis:=20helper=5Fpg.query=5Fre?=
 =?UTF-8?q?try=20=E2=80=94=20opt-in=20real=5Ftime=5Frecency=20kwarg?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wraps the SELECT in a session that has SET real_time_recency = TRUE. Under
strict-serializable, this pushes the chosen-ts lower bound to the source's
real-time upstream frontier, so the SELECT waits for ingestion to reach the
broker/upstream high-water mark before responding.

Existing 'wait_for_catchup' on mz_source_statistics.offset_committed is
insufficient as a queryability gate: offset_committed tracks the data-shard
upper, which can advance past oracle_read_ts via the source's reclock while
the corresponding rows live at an mz_ts further forward (assigned by the
next-probe binding). The strict-serializable SELECT then picks a chosen-ts
between the two and returns count=0.

Used by drivers that produce-then-assert against kafka/mysql sources. MV-over-
table drivers don't need this; tables have no upstream to probe and the table
writer's commit already advances the timestamp oracle.
---
 test/antithesis/workload/test/helper_pg.py | 29 +++++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index e3508c4f44b4a..59a88f1963ab3 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -108,13 +108,32 @@ def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None:
             backoff = min(backoff * 2, _RETRY_MAX_S)
 
 
-def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]:
-    """Run a query and return all rows, retrying transient errors."""
+def query_retry(
+    sql: str,
+    params: Sequence[Any] | None = None,
+    real_time_recency: bool = False,
+) -> list[tuple[Any, ...]]:
+    """Run a query and return all rows, retrying transient errors.
+
+    Set `real_time_recency=True` when the query is a queryability gate after a
+    just-produced upstream write. With strict-serializable (the workload
+    default) plus real-time recency, the coordinator pushes the SELECT
+    timestamp's lower bound to the source's real-time frontier — i.e. the
+    SELECT waits for ingestion to reach the broker/upstream's current
+    high-water mark before responding. Without this, `wait_for_catchup` on
+    `mz_source_statistics.offset_committed` can clear before the just-ingested
+    rows are visible at the timestamp the SELECT chooses (`offset_committed`
+    tracks the data-shard upper, which can advance past `oracle_read_ts` while
+    the rows live at an mz_ts further forward — assigned by the reclock's
+    next-probe binding).
+    """
     deadline = time.monotonic() + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
     while True:
         try:
             with connect() as conn, conn.cursor() as cur:
+                if real_time_recency:
+                    cur.execute("SET real_time_recency = TRUE")
                 cur.execute(sql, params or ())
                 return list(cur.fetchall())
         except Exception as exc:  # noqa: BLE001
@@ -126,9 +145,11 @@ def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any
 
 
 def query_one_retry(
-    sql: str, params: Sequence[Any] | None = None
+    sql: str,
+    params: Sequence[Any] | None = None,
+    real_time_recency: bool = False,
 ) -> tuple[Any, ...] | None:
-    rows = query_retry(sql, params)
+    rows = query_retry(sql, params, real_time_recency=real_time_recency)
     return rows[0] if rows else None
 
 

From 312537f18358a25f2477a34a0a0d1795960feaee Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 17:51:32 -0400
Subject: [PATCH 39/65] test/antithesis: drivers use real_time_recency for
 queryability gate

Apply real_time_recency=True to the SELECTs that follow wait_for_catchup or
the equivalent in:

  - parallel_driver_upsert_latest_value.py
  - singleton_driver_upsert_state_rehydration.py
  - parallel_driver_kafka_none_envelope.py
  - parallel_driver_mysql_cdc.py

These drivers all produce upstream (kafka/mysql), wait for a catchup signal,
then SELECT and assert. The current catchup signal (offset_committed in
mz_source_statistics, or a COUNT-based poll) clears before the just-ingested
rows are visible at the strict-serializable read timestamp the SELECT picks:

  * offset_committed reflects the data-shard upper reclocked to upstream
    offsets. It can advance past oracle_read_ts via the source's reclock
    binding while the corresponding rows live at an mz_ts further forward
    (assigned by the next-probe binding).
  * COUNT-based polling only requires a single chosen-ts where the count
    matches; the immediately-following per-row SELECT picks oracle_read_ts
    afresh and can race.

real_time_recency forces the SELECT's chosen-ts lower bound to the source's
real-time upstream frontier, so the SELECT waits for ingestion to reach the
broker's/replica's current high-water mark before responding. See the
docstring on helper_pg.query_retry for the full reasoning.

Not applied to parallel_driver_mv_reflects_table_updates: tables have no
upstream to probe (RTR no-ops), and the existing count-based poll on the MV
is already queryability-based.

Not applied to parallel_driver_strict_serializable_reads: it already opens
fresh connections with explicit SET REAL_TIME_RECENCY TO TRUE.
---
 .../workload/test/parallel_driver_kafka_none_envelope.py    | 6 +++++-
 test/antithesis/workload/test/parallel_driver_mysql_cdc.py  | 6 ++++++
 .../workload/test/parallel_driver_upsert_latest_value.py    | 5 +++++
 .../test/singleton_driver_upsert_state_rehydration.py       | 2 ++
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
index 3c1a4e1359793..bbb4e2529eca8 100755
--- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -132,7 +132,9 @@ def main() -> int:
     # ----- no-data-duplication -----
     # `GROUP BY partition, "offset" HAVING COUNT(*) > 1` filtered to this
     # invocation's payloads. The catalog's `kafka-source-no-data-duplication`
-    # property names this exact query shape.
+    # property names this exact query shape. real_time_recency forces the
+    # SELECT past the kafka broker's real-time frontier — see
+    # helper_pg.query_retry for why this is required.
     dup_rows = query_retry(
         f"""
         SELECT partition, "offset", COUNT(*)::bigint
@@ -142,6 +144,7 @@ def main() -> int:
         HAVING COUNT(*) > 1
         """,
         (f"{prefix}:%",),
+        real_time_recency=True,
     )
     always(
         len(dup_rows) == 0,
@@ -175,6 +178,7 @@ def main() -> int:
         GROUP BY 1, 2, 3
         """,
         (f"{prefix}:%",),
+        real_time_recency=True,
     )
     by_payload: dict[str, tuple[int, int, int]] = {}
     for text, partition, offset, count in rows:
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
index c51330251bad8..233207ff8e3c6 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
@@ -130,9 +130,14 @@ def _wait_for_catchup(batch_id: str, expected_count: int) -> bool:
 def _check_rows(expected: dict[str, str]) -> None:
     """Assert every expected row has the correct value in the Materialize source."""
     for row_id, want in expected.items():
+        # real_time_recency: the count-based catchup above can clear at a
+        # chosen-ts that just barely satisfies the COUNT, leaving a per-row
+        # SELECT moments later to race. RTR pushes chosen-ts to the mysql
+        # upstream's real-time frontier; see helper_pg.query_retry.
         rows = query_retry(
             f"SELECT value FROM {TABLE_NAME} WHERE id = %s",
             (row_id,),
+            real_time_recency=True,
         )
         found = bool(rows)
         observed = rows[0][0] if found else None
@@ -199,6 +204,7 @@ def main() -> int:
     rows = query_retry(
         f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
         (batch_id,),
+        real_time_recency=True,
     )
     count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0
     always(
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index b58c15adcfa34..fcfabea77620d 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -85,9 +85,14 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]:
     out of scope for this property and should be caught by
     `kafka-source-no-data-duplication`.
     """
+    # real_time_recency forces the SELECT timestamp past the kafka source's
+    # real-time upstream frontier, so the row written for this key is visible
+    # at chosen-ts. `wait_for_catchup` on `offset_committed` alone is not
+    # sufficient — see helper_pg.query_retry for the full reasoning.
     row = query_one_retry(
         f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s",
         (key,),
+        real_time_recency=True,
     )
     if row is None:
         return False, None
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index d7ccedb9e1a3b..26342d0ed43e8 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -91,9 +91,11 @@
 def _select_value_for_key(key: str) -> tuple[bool, str | None]:
     """Duplicate of `_select_value_for_key` in `parallel_driver_upsert_latest_value.py`.
     Kept inline to avoid expanding helper surface for one shared private function."""
+    # See helper_pg.query_retry for why real_time_recency is required here.
     row = query_one_retry(
         f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s",
         (key,),
+        real_time_recency=True,
     )
     if row is None:
         return False, None

From d43144d5b7d71d3446d46f9435b2f1ac3d6f2d11 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 17:55:43 -0400
Subject: [PATCH 40/65] test/antithesis: parallel_driver_parallel_workload
 setup phase tolerates concurrent races
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multiple parallel-driver invocations race the deterministic object-name pool
in _create_database_for_antithesis (role0..roleN, cluster-0..cluster-N, etc).
Setup statements run without IF NOT EXISTS / IF EXISTS guards in many places,
and the DROP CLUSTER / DROP ROLE loops do not use IF EXISTS — so the loser
of any given race sees:

  * cluster 'cluster-0' already exists
  * unknown role 'role0'
  * unknown cluster 'cluster-0'
  * role "role0" cannot be dropped because some objects depend on it

These are the same concurrent-DDL outcomes the parallel_workload framework
already tolerates inside the worker loop via Action.errors_to_ignore at
DDL complexity. The setup phase had no equivalent tolerance, so any of these
escaped as a setup_failure and the always-zero-exit assertion fired:
  always(unexpected is None, "parallel workload: no unexpected SQL errors …")

Add _tolerate_setup_race that catches QueryError or Exception with any of the
expected race substrings and proceeds. Wrap every setup DDL statement:
db.drop/db.create, the DROPs inside the cluster/role enumerate-and-drop loops
(the enumerating SELECTs themselves stay unwrapped), the DROP/CREATE
CONNECTION + SECRET statements, and the per-relation create loop.

The pattern list mirrors action.Action.errors_to_ignore for the DDL tier.
---
 .../test/parallel_driver_parallel_workload.py | 95 +++++++++++++++----
 1 file changed, 78 insertions(+), 17 deletions(-)

diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index d5352500b9440..ec3b91e9b38c6 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -58,6 +58,7 @@
     PGUSER_INTERNAL,
 )
 
+from materialize.data_ingest.query_error import QueryError
 from materialize.parallel_workload import executor as _pw_executor
 from materialize.parallel_workload.action import (
     ddl_action_list,
@@ -175,47 +176,107 @@ def _prepare_system(num_threads: int) -> None:
             )
 
 
+# Expected substring matches for SQL errors raised during the setup phase when
+# multiple parallel-driver invocations race the same deterministic object
+# names (`role0`, `cluster-0`, etc.). Each invocation does best-effort cleanup
+# + create; whoever loses the race sees one of these and continues. The same
+# patterns are already tolerated by the parallel_workload framework itself in
+# `action.Action.errors_to_ignore` for the DDL complexity tier, so the setup
+# phase tolerates the same surface area.
+_SETUP_RACE_PATTERNS = (
+    "already exists",
+    "unknown role",
+    "unknown cluster",
+    "unknown schema",
+    "unknown catalog item",
+    "cannot be dropped because",
+    "was concurrently dropped",
+    "was removed",
+    "' was dropped",
+    "was dropped while executing a statement",
+    "another session modified the catalog",
+    "object state changed while transaction was in progress",
+)
+
+
+def _tolerate_setup_race(fn, *args, **kwargs):
+    """Run `fn(...)`, swallowing the concurrent-race messages in
+    `_SETUP_RACE_PATTERNS` and propagating anything else.
+
+    The setup phase is invoked by every parallel-driver invocation, and the
+    framework picks deterministic object names from a small pool. Concurrent
+    invocations therefore race to drop-then-create the same names; any single
+    race outcome is fine because the per-invocation Database object only
+    needs its named objects to exist by the time worker threads start.
+    """
+    try:
+        return fn(*args, **kwargs)
+    except QueryError as exc:
+        if any(pat in (exc.msg or "") for pat in _SETUP_RACE_PATTERNS):
+            LOG.debug("setup tolerated: %s — %s", exc.query, exc.msg)
+            return None
+        raise
+    except Exception as exc:  # noqa: BLE001
+        if any(pat in str(exc) for pat in _SETUP_RACE_PATTERNS):
+            LOG.debug("setup tolerated: %s", exc)
+            return None
+        raise
+
+
 def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
     """Stand-in for `Database.create` that only sets up connections matching
     the Antithesis topology. Upstream's `create()` also wires polaris,
     sql-server, and an external postgres source — none of those are running
-    in this compose."""
+    in this compose.
+
+    Every DDL statement is wrapped with `_tolerate_setup_race` because parallel
+    invocations of this driver race the same deterministic object names
+    (`role0..roleN`, `cluster-0..cluster-N`). Whoever loses the race for a
+    given object sees a known race message — already-exists, unknown-role,
+    unknown-cluster, or a transient DEPENDS-ON cleanup mismatch — and the
+    other invocation's outcome is fine for our purposes.
+    """
     from pg8000.native import identifier
 
     for db in database.dbs:
-        db.drop(exe)
-        db.create(exe)
+        _tolerate_setup_race(db.drop, exe)
+        _tolerate_setup_race(db.create, exe)
 
     exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'")
     for row in exe.cur.fetchall():
-        exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE")
+        _tolerate_setup_race(
+            exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE"
+        )
 
-    exe.execute("DROP SECRET IF EXISTS minio CASCADE")
-    exe.execute("DROP CONNECTION IF EXISTS aws_conn CASCADE")
-    exe.execute("DROP CONNECTION IF EXISTS kafka_conn CASCADE")
-    exe.execute("DROP CONNECTION IF EXISTS csr_conn CASCADE")
+    _tolerate_setup_race(exe.execute, "DROP SECRET IF EXISTS minio CASCADE")
+    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS aws_conn CASCADE")
+    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS kafka_conn CASCADE")
+    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS csr_conn CASCADE")
 
     exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'")
     for row in exe.cur.fetchall():
-        exe.execute(f"DROP ROLE {identifier(row[0])}")
+        _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}")
 
-    exe.execute(
+    _tolerate_setup_race(
+        exe.execute,
         "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA "
-        "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT"
+        "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT",
     )
-    exe.execute(
+    _tolerate_setup_race(
+        exe.execute,
         "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA "
-        "REGISTRY URL 'http://schema-registry:8081'"
+        "REGISTRY URL 'http://schema-registry:8081'",
     )
-    exe.execute("CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'")
-    exe.execute(
+    _tolerate_setup_race(exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'")
+    _tolerate_setup_race(
+        exe.execute,
         "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS ("
         "ENDPOINT 'http://minio:9000/', REGION 'minio', "
-        "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET minio)"
+        "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET minio)",
     )
 
     for relation in database:
-        relation.create(exe)
+        _tolerate_setup_race(relation.create, exe)
 
 
 def _spawn_workers(

From 26a70cee0475ba9cf7f29fa3c1cab7a109566ed2 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 17:57:38 -0400
Subject: [PATCH 41/65] test/antithesis: _replica_non_online queries history
 table, not current-state view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Sometimes assertions:
  * fault recovery: observed antithesis_cluster replica non-online at least once
  * kafka source resumes: observed antithesis_cluster replica non-online

both rely on `_replica_non_online()` returning True at least once across all
invocations in a run. The previous implementation queried
`mz_cluster_replica_statuses` (DISTINCT ON (replica_id, process_id) over the
underlying history shard), which shows only the latest tick per process. With
a 0.5s probe cadence and a 30s invocation budget, an Antithesis fault that
takes a clusterd offline-then-back-online within a sub-second window slips
between two consecutive polls — the SDK never sees a non-online status, the
Sometimes assertion never fires, and we get a 0-pass / N-fail finding even
though the fault recipe is correctly hitting the cluster.

Switch to `mz_internal.mz_cluster_replica_status_history` and filter on
`h.status = 'offline'`. This is the underlying audit log; any past offline
event remains visible from any later poll within the retention window, so we
record the fault even if the transition fully completed before the next probe.

Same change in both drivers (the helper was duplicated).
---
 .../test/anytime_fault_recovery_exercised.py   | 18 ++++++++++++++----
 ...anytime_kafka_source_resumes_after_fault.py | 12 +++++++++---
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
index ff90867b0b6f5..65f3ed4f695f0 100755
--- a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
+++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
@@ -93,7 +93,17 @@ def _probe_select_one() -> bool:
 
 
 def _replica_non_online() -> bool:
-    """Best-effort: is any antithesis-cluster replica reporting non-online?
+    """Did any antithesis_cluster replica record an `offline` status at any
+    point in this timeline?
+
+    Queries `mz_cluster_replica_status_history` (audit log) rather than
+    `mz_cluster_replica_statuses` (current-state view). The current-state
+    view shows only the latest tick per (replica, process), so a transient
+    offline window — exactly the shape Antithesis fault injection creates
+    when it pauses or kills clusterd1 / clusterd2 for a few seconds — can
+    open and close between two consecutive polls and the assertion never
+    fires. The history table is sticky: once an offline event is recorded
+    it stays observable from any later poll within the retention window.
 
     Uses the retry-budgeted query helper because we want a clear yes/no, not
     a probe outcome — if the helper can't get an answer we conservatively
@@ -105,10 +115,10 @@ def _replica_non_online() -> bool:
             """
             SELECT EXISTS (
                 SELECT 1
-                FROM mz_internal.mz_cluster_replica_statuses s
-                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                FROM mz_internal.mz_cluster_replica_status_history h
+                JOIN mz_cluster_replicas r ON r.id = h.replica_id
                 JOIN mz_clusters c ON c.id = r.cluster_id
-                WHERE c.name = %s AND s.status != 'online'
+                WHERE c.name = %s AND h.status = 'offline'
             )
             """,
             (ANTITHESIS_CLUSTER,),
diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
index b453f62631aac..9c10879bd8291 100755
--- a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
+++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
@@ -100,15 +100,21 @@ def _offset_committed(source_name: str) -> int | None:
 
 
 def _replica_non_online() -> bool:
+    """Did any antithesis_cluster replica record an `offline` status in this
+    timeline? Queries the audit history (`mz_cluster_replica_status_history`)
+    rather than the current-state view so a transient offline window between
+    two polls is still observable. See the matching helper in
+    `anytime_fault_recovery_exercised.py` for the full reasoning.
+    """
     try:
         row = query_one_retry(
             """
             SELECT EXISTS (
                 SELECT 1
-                FROM mz_internal.mz_cluster_replica_statuses s
-                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                FROM mz_internal.mz_cluster_replica_status_history h
+                JOIN mz_cluster_replicas r ON r.id = h.replica_id
                 JOIN mz_clusters c ON c.id = r.cluster_id
-                WHERE c.name = %s AND s.status != 'online'
+                WHERE c.name = %s AND h.status = 'offline'
             )
             """,
             (ANTITHESIS_CLUSTER,),

From adaed905255634354b94aef816f60a3764c62069 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 18:17:00 -0400
Subject: [PATCH 42/65] parallel_workload: pool-backed mode with seed-scoped
 names and external clusterds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three coupled additions to make parallel_workload safe to run as multiple
concurrent invocations sharing one Materialize instance, with each
invocation's cluster routed to a dedicated external clusterd container.

  1. Database(seed_scoped_names: bool = False). When True, forwards a
     'name_scope' string to every Role and Cluster the framework creates,
     producing 'cluster-<seed>-<id>' and 'role-<seed>-<id>' rather than
     'cluster-<id>' / 'role<id>'. Schemas / tables / views / sources etc.
     don't need this — their fully-qualified names already flow through
     DB.name() which already embeds the seed. Default False so non-
     Antithesis consumers keep their existing name shapes.

     Role.__str__ now passes through pg8000.native.identifier() so the
     quoted-dashed names round-trip correctly; no-op for names like 'role0'.

  2. Database(pool_members: list[ClusterdPoolMember] | None = None) and
     a new ClusterdPoolMember dataclass (host + storagectl/computectl/
     compute/storage ports + workers). When set, the framework provisions
     unmanaged Cluster replicas with explicit STORAGECTL/STORAGE/
     COMPUTECTL/COMPUTE ADDRESSES pointed at the supplied member(s)
     instead of emitting managed SIZE/REPLICATION FACTOR. The is_pool_backed
     property on Cluster gates the rendering.

  3. CreateClusterAction / CreateClusterReplicaAction / DropClusterReplicaAction
     skip pool-backed clusters: there is no in-band allocator for grabbing
     additional pool members from a worker thread, and replication factor
     manipulation has no analogue in unmanaged-replica mode. The framework
     therefore only ever touches the pool members the caller pre-allocated.

These three pieces only make sense together: seed-scoping by itself
doesn't isolate the clusterd workload; the pool backend by itself collides
on global names; skipping the dynamic DDL by itself would just leave
clusters un-grown in a managed-cluster topology where that's the workload.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../materialize/parallel_workload/action.py   |  23 +-
 .../materialize/parallel_workload/database.py | 207 ++++++++++++++++--
 2 files changed, 206 insertions(+), 24 deletions(-)

diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py
index 3318bee4aeaf5..89d9c0801fc76 100644
--- a/misc/python/materialize/parallel_workload/action.py
+++ b/misc/python/materialize/parallel_workload/action.py
@@ -1987,7 +1987,7 @@ def run(self, exe: Executor) -> bool:
                 return False
             role_id = exe.db.role_id
             exe.db.role_id += 1
-        role = Role(role_id)
+        role = Role(role_id, name_scope=exe.db.name_scope)
         role.create(exe)
         exe.db.roles.append(role)
         return True
@@ -2026,6 +2026,13 @@ def run(self, exe: Executor) -> bool:
 
 class CreateClusterAction(Action):
     def run(self, exe: Executor) -> bool:
+        # In pool mode the Database's clusters are wired to pre-existing
+        # clusterd containers from a finite pool the caller passed in.
+        # Dynamically creating a new cluster would need to claim an unused
+        # pool member, and we don't have an allocator. Skip — the initial
+        # clusters set up at construction time are the test surface.
+        if exe.db.pool_members is not None:
+            return False
         with exe.db.lock:
             if len(exe.db.clusters) >= MAX_CLUSTERS:
                 return False
@@ -2037,6 +2044,7 @@ def run(self, exe: Executor) -> bool:
             size=self.rng.choice(["1", "2"]),
             replication_factor=self.rng.choice([1, 2]),
             introspection_interval="1s",
+            name_scope=exe.db.name_scope,
         )
         cluster.create(exe)
         exe.db.clusters.append(cluster)
@@ -2170,6 +2178,12 @@ def run(self, exe: Executor) -> bool:
         with exe.db.lock:
             # Keep cluster 0 with 1 replica for sources/sinks
             unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed]
+            # Pool-backed clusters can't grow their replica count — there's
+            # no pool allocator handing out a fresh ClusterdPoolMember per
+            # CREATE CLUSTER REPLICA. Skip them.
+            unmanaged_clusters = [
+                c for c in unmanaged_clusters if not c.is_pool_backed
+            ]
             if not unmanaged_clusters:
                 return False
             cluster = self.rng.choice(unmanaged_clusters)
@@ -2193,6 +2207,13 @@ def run(self, exe: Executor) -> bool:
         with exe.db.lock:
             # Keep cluster 0 with 1 replica for sources/sinks
             unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed]
+            # Pool-backed clusters can't shrink either — without an
+            # allocator to release the pool member back, the in-memory
+            # model would diverge from materialize's catalog and later
+            # creates targeting the freed slot would conflict.
+            unmanaged_clusters = [
+                c for c in unmanaged_clusters if not c.is_pool_backed
+            ]
             if not unmanaged_clusters:
                 return False
             cluster = self.rng.choice(unmanaged_clusters)
diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py
index bad0b4081bbde..4eecbdf2ea2f3 100644
--- a/misc/python/materialize/parallel_workload/database.py
+++ b/misc/python/materialize/parallel_workload/database.py
@@ -7,6 +7,7 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 
+import dataclasses
 import random
 import threading
 import uuid
@@ -885,31 +886,81 @@ def __str__(self) -> str:
 class Role:
     role_id: int
     lock: threading.Lock
-
-    def __init__(self, role_id: int):
+    # Inserted, dash-delimited, between `role` and `{role_id}` in the generated
+    # name. Empty by default (giving the historical `role0` shape). When set,
+    # gives `role-{name_scope}-{role_id}` — used by callers like the Antithesis
+    # parallel-driver where many concurrent Database instances against one
+    # materialize would otherwise collide on the same `role0..roleN` names.
+    name_scope: str
+
+    def __init__(self, role_id: int, name_scope: str = ""):
         self.role_id = role_id
         self.lock = threading.Lock()
+        self.name_scope = name_scope
 
     def __str__(self) -> str:
+        # Format: `role[-{name_scope}-]{role_id}`. The bracketed segment is
+        # only present when seed-scoping is on, so the historical `role0`
+        # shape (which non-Antithesis consumers parse) is preserved.
+        # Scoped names need identifier-quoting because dashes aren't valid
+        # in an unquoted identifier; unscoped names stay bare to match the
+        # original SQL the framework emits.
+        if self.name_scope:
+            return identifier(f"role-{self.name_scope}-{self.role_id}")
         return f"role{self.role_id}"
 
     def create(self, exe: Executor) -> None:
         exe.execute(f"CREATE ROLE {self}")
 
 
+@dataclasses.dataclass(frozen=True)
+class ClusterdPoolMember:
+    """One entry in an external clusterd pool that a `Cluster` can target as
+    an unmanaged replica.
+
+    Used by callers (Antithesis parallel-driver) that want fault-isolation
+    per cluster: each pool member is its own container, so Antithesis can
+    kill/pause/partition exactly one cluster's storage+compute without
+    taking down the other clusters that share the materialized container's
+    process orchestrator.
+
+    The default ports match clusterd's defaults; override per environment.
+    """
+
+    host: str
+    storagectl_port: int = 2100
+    computectl_port: int = 2101
+    compute_port: int = 2102
+    storage_port: int = 2103
+    workers: int = 4
+
+
 class ClusterReplica:
     replica_id: int
     size: str
     cluster: "Cluster"
     rename: int
     lock: threading.Lock
+    # When non-None, the replica is wired to a pre-existing clusterd
+    # container via unmanaged-cluster syntax (STORAGECTL/COMPUTE ADDRESSES)
+    # rather than provisioned through the orchestrator. The replica's
+    # `size` field is ignored in that case; `pool_member.workers` provides
+    # the WORKERS clause.
+    pool_member: ClusterdPoolMember | None
 
-    def __init__(self, replica_id: int, size: str, cluster: "Cluster"):
+    def __init__(
+        self,
+        replica_id: int,
+        size: str,
+        cluster: "Cluster",
+        pool_member: ClusterdPoolMember | None = None,
+    ):
         self.replica_id = replica_id
         self.size = size
         self.cluster = cluster
         self.rename = 0
         self.lock = threading.Lock()
+        self.pool_member = pool_member
 
     def name(self) -> str:
         if self.rename:
@@ -935,6 +986,12 @@ class Cluster:
     introspection_interval: str
     rename: int
     lock: threading.Lock
+    # Inserted as `-{name_scope}` between `cluster` and `-{cluster_id}` in the
+    # generated name. Empty by default (giving the historical `cluster-N`
+    # shape). When set, gives `cluster-{name_scope}-N` — used by the Antithesis
+    # parallel-driver, where many concurrent Database instances against one
+    # materialize would otherwise collide on the same `cluster-N` names.
+    name_scope: str
 
     def __init__(
         self,
@@ -943,29 +1000,84 @@ def __init__(
         size: str,
         replication_factor: int,
         introspection_interval: str,
+        name_scope: str = "",
+        pool_members: list[ClusterdPoolMember] | None = None,
     ):
         self.cluster_id = cluster_id
         self.managed = managed
         self.size = size
-        self.replicas = [
-            ClusterReplica(i, size, self) for i in range(replication_factor)
-        ]
+        # When `pool_members` is supplied, the cluster runs in unmanaged mode
+        # against one pre-existing clusterd container per replica. We force
+        # `managed=False` (the unmanaged-cluster syntax is what carries the
+        # STORAGECTL/COMPUTE ADDRESSES clauses) and ignore `replication_factor`
+        # in favour of `len(pool_members)`.
+        if pool_members is not None:
+            if not pool_members:
+                raise ValueError(
+                    "pool_members must be non-empty when provided; one member per replica"
+                )
+            self.managed = False
+            self.replicas = [
+                ClusterReplica(i, size, self, pool_member=pool_members[i])
+                for i in range(len(pool_members))
+            ]
+        else:
+            self.replicas = [
+                ClusterReplica(i, size, self) for i in range(replication_factor)
+            ]
         self.replica_id = len(self.replicas)
         self.introspection_interval = introspection_interval
         self.rename = 0
         self.lock = threading.Lock()
+        self.name_scope = name_scope
+
+    @property
+    def is_pool_backed(self) -> bool:
+        """True iff every replica is wired to a pre-existing clusterd
+        container rather than provisioned through the orchestrator. Action
+        classes that would mutate replica count check this and bail —
+        we don't dynamically allocate from the pool."""
+        return all(r.pool_member is not None for r in self.replicas)
 
     def name(self) -> str:
+        # Format: `cluster[-{name_scope}]-{cluster_id}[-{rename}]`. The
+        # bracketed `-{name_scope}` segment is only present when seed-
+        # scoping is on, so the historical `cluster-0` / `cluster-0-1`
+        # shapes (which non-Antithesis consumers parse) are preserved.
+        prefix = (
+            f"cluster-{self.name_scope}" if self.name_scope else "cluster"
+        )
         if self.rename:
-            return naughtify(f"cluster-{self.cluster_id}-{self.rename}")
-        return naughtify(f"cluster-{self.cluster_id}")
+            return naughtify(f"{prefix}-{self.cluster_id}-{self.rename}")
+        return naughtify(f"{prefix}-{self.cluster_id}")
 
     def __str__(self) -> str:
         return identifier(self.name())
 
     def create(self, exe: Executor) -> None:
         query = f"CREATE CLUSTER {self} "
-        if self.managed:
+        if self.is_pool_backed:
+            # Unmanaged cluster pointing at pre-existing clusterd containers.
+            # Each replica gets the STORAGECTL/STORAGE/COMPUTECTL/COMPUTE
+            # ADDRESSES of its pool member; WORKERS comes from the pool
+            # member's config. Requires
+            # `unsafe_enable_unorchestrated_cluster_replicas = true` on the
+            # SUT (see test/antithesis/mzcompose.py for the Antithesis case).
+            replica_specs = []
+            for replica in self.replicas:
+                assert replica.pool_member is not None
+                m = replica.pool_member
+                replica_specs.append(
+                    f"{replica} ("
+                    f"STORAGECTL ADDRESSES ['{m.host}:{m.storagectl_port}'], "
+                    f"STORAGE ADDRESSES ['{m.host}:{m.storage_port}'], "
+                    f"COMPUTECTL ADDRESSES ['{m.host}:{m.computectl_port}'], "
+                    f"COMPUTE ADDRESSES ['{m.host}:{m.compute_port}'], "
+                    f"WORKERS {m.workers}"
+                    f")"
+                )
+            query += "REPLICAS(" + ", ".join(replica_specs) + ")"
+        elif self.managed:
             query += f"SIZE = '{self.size}', REPLICATION FACTOR = {len(self.replicas)}, INTROSPECTION INTERVAL = '{self.introspection_interval}'"
         else:
             query += "REPLICAS("
@@ -1025,12 +1137,35 @@ def __init__(
         complexity: Complexity,
         scenario: Scenario,
         naughty_identifiers: bool,
+        # When True, top-level objects whose names are not schema-qualified
+        # (clusters and roles) are scoped by the database seed so concurrent
+        # Database instances against one materialize don't collide. Off by
+        # default; opted into by the Antithesis parallel-driver where many
+        # invocations share the SUT. Tables / schemas / views are already
+        # qualified by DB.name() which includes the seed, so they don't
+        # need this.
+        seed_scoped_names: bool = False,
+        # When non-None, every cluster the Database creates uses the
+        # external clusterd-pool backend (unmanaged-with-explicit-addresses)
+        # rather than the orchestrator. At construction time the Database
+        # creates one single-replica cluster per list entry, so
+        # `len(pool_members)` is the initial cluster count. See
+        # `ClusterdPoolMember` for the shape of each entry.
+        pool_members: list[ClusterdPoolMember] | None = None,
     ):
         self.host = host
         self.ports = ports
         self.complexity = complexity
         self.scenario = scenario
         self.seed = seed
+        self.seed_scoped_names = seed_scoped_names
+        self.pool_members = pool_members
+        # The bare seed (no leading/trailing punctuation) used by Cluster /
+        # Role / etc. to assemble their scoped names. Empty when seed-scoping
+        # is off, in which case those classes fall back to their historical
+        # `cluster-N` / `role0` shapes. See Cluster.name() and Role.__str__()
+        # for how the seed gets inlaid.
+        self.name_scope = seed if seed_scoped_names else ""
         set_naughty_identifiers(naughty_identifiers)
 
         self.s3_path = 0
@@ -1064,21 +1199,47 @@ def __init__(
             )
             self.views.append(view)
         self.view_id = len(self.views)
-        self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))]
-        self.role_id = len(self.roles)
-        # At least one storage cluster required for WebhookSources
-        self.clusters = [
-            Cluster(
-                i,
-                managed=rng.choice([True, False]),
-                size=rng.choice(
-                    ["scale=1,workers=1", "scale=1,workers=4", "scale=2,workers=2"]
-                ),
-                replication_factor=1,
-                introspection_interval="1s",
-            )
-            for i in range(rng.randint(1, MAX_INITIAL_CLUSTERS))
+        self.roles = [
+            Role(i, name_scope=self.name_scope)
+            for i in range(rng.randint(0, MAX_INITIAL_ROLES))
         ]
+        self.role_id = len(self.roles)
+        # At least one storage cluster required for WebhookSources.
+        # In pool mode, each cluster claims one pool member from a
+        # deterministic slice; the number of clusters is the slice size, no
+        # rng.randint. Caller is responsible for sizing `pool_members` to
+        # the desired cluster count.
+        if pool_members is not None:
+            initial_cluster_count = len(pool_members)
+            self.clusters = [
+                Cluster(
+                    i,
+                    # managed/size are ignored when pool-backed but kept as
+                    # placeholder values for any code that reads them
+                    # without consulting `is_pool_backed`.
+                    managed=False,
+                    size=pool_members[i].host,
+                    replication_factor=1,
+                    introspection_interval="1s",
+                    name_scope=self.name_scope,
+                    pool_members=[pool_members[i]],
+                )
+                for i in range(initial_cluster_count)
+            ]
+        else:
+            self.clusters = [
+                Cluster(
+                    i,
+                    managed=rng.choice([True, False]),
+                    size=rng.choice(
+                        ["scale=1,workers=1", "scale=1,workers=4", "scale=2,workers=2"]
+                    ),
+                    replication_factor=1,
+                    introspection_interval="1s",
+                    name_scope=self.name_scope,
+                )
+                for i in range(rng.randint(1, MAX_INITIAL_CLUSTERS))
+            ]
         self.cluster_id = len(self.clusters)
         self.indexes = set()
         self.webhook_sources = [

From 19db5376106104507a9e881afc9fd342cdcacb96 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 18:27:05 -0400
Subject: [PATCH 43/65] test/antithesis: add configurable clusterd pool for
 parallel-workload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reserve a pool of pre-existing clusterd containers
(`clusterd-pool-{0..N-1}`) so each parallel-workload cluster can land on
its own container and Antithesis can fault-inject it in isolation.
Without the pool, parallel-workload's clusters all live as child
processes of environmentd under the materialized container's process
orchestrator, and Antithesis can only kill / pause / partition the
container as a unit.

Pool size is read from `ANTITHESIS_CLUSTERD_POOL_SIZE` (env), default 8.
Each member is identical to clusterd1 / clusterd2: 4 timely workers,
no scratch, restart=no. `workflow_default` brings them up before
materialized so the controller can reach them when CREATE CLUSTER
references their addresses.

`Materialized` already has `unsafe_enable_unorchestrated_cluster_replicas`
set so CREATE CLUSTER ... STORAGECTL ADDRESSES is accepted.

config/docker-compose.yaml regenerated via
  bin/pyactivate test/antithesis/export-compose.py
to match — the YAML is generated, not hand-edited.
---
 test/antithesis/config/docker-compose.yaml | 304 +++++++++++++++++++++
 test/antithesis/mzcompose.py               |  44 +++
 2 files changed, 348 insertions(+)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 446c9d0a189f6..900b586870e75 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -311,6 +311,310 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-0:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-0
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-1:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-1
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-2:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-2
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-3:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-3
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-4:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-4
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-5:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-5
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-6:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-6
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
+  clusterd-pool-7:
+    entrypoint:
+    - tini
+    - --
+    command:
+    - clusterd
+    ports:
+    - 2100
+    - 2101
+    - 6878
+    environment:
+    - CLUSTERD_GRPC_HOST=clusterd-pool-7
+    - CLUSTERD_USE_CTP=true
+    - MZ_SOFT_ASSERTIONS=1
+    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
+    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
+    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
+    - CLUSTERD_SECRETS_READER=local-file
+    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
+    - LD_PRELOAD=libeatmydata.so
+    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
+    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
+    - CLUSTERD_PROCESS=0
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2102"],
+      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2103"],
+      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
+      false, "zero_copy_limit": null}'
+    volumes:
+    - mzdata:/mzdata
+    - mydata:/var/lib/mysql-files
+    - tmp:/share/tmp
+    - scratch:/scratch
+    restart: 'no'
+    stop_grace_period: 120s
+    platform: linux/amd64
+    image: ${MATERIALIZED_IMAGE}
   materialized:
     hostname: materialized
     depends_on:
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 5f7da9d8f0e97..d66c63eb3348f 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -20,6 +20,16 @@
                         Antithesis killing either container exercises the
                         compute/storage-replica recovery and rebalancing
                         paths without taking the cluster offline.
+  - clusterd-pool-{0..N-1} : a configurable pool of external clusterd
+                        containers that the parallel-workload driver
+                        claims one-per-cluster to give each
+                        parallel-workload cluster its own container.
+                        Without this pool, parallel-workload clusters
+                        would all share materialized's process orchestrator
+                        and Antithesis could only fault the entire
+                        container as a unit. Pool size is controlled by
+                        the `ANTITHESIS_CLUSTERD_POOL_SIZE` env var (read
+                        from the environment at import; defaults to 8).
   - materialized      : the SUT (environmentd; clusterd is external)
   - workload          : Python test driver wired to the Antithesis SDK
 
@@ -28,6 +38,8 @@
   bin/pyactivate test/antithesis/export-compose.py > config/...     # dump compose YAML
 """
 
+import os
+
 from materialize.mzcompose.composition import Composition
 from materialize.mzcompose.service import Service, ServiceConfig
 from materialize.mzcompose.services.clusterd import Clusterd
@@ -39,6 +51,15 @@
 from materialize.mzcompose.services.schema_registry import SchemaRegistry
 from materialize.mzcompose.services.zookeeper import Zookeeper
 
+# Number of pool clusterd containers reserved for parallel-workload clusters
+# (one container per cluster, giving each its own container-level fault
+# domain). Read from the env so CI/local runs can tune it without editing
+# this file. Default 8 — enough for ~8 concurrent parallel-driver
+# invocations under the v1 "one cluster per invocation, replication
+# factor 1" allocation, see test/antithesis/workload/test/
+# parallel_driver_parallel_workload.py.
+CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "8"))
+
 
 class Workload(Service):
     """Antithesis workload client — Python test driver."""
@@ -146,6 +167,27 @@ def __init__(self) -> None:
         workers=4,
         scratch_directory=None,
     ),
+    # Pool of identical clusterd containers reserved for the
+    # parallel-workload driver. Each instance is a possible target for
+    # one parallel-workload cluster, giving that cluster its own
+    # container-level fault domain (Antithesis can kill / pause /
+    # partition / throttle a specific pool member without affecting any
+    # other cluster). Same settings as clusterd1/clusterd2: 4 timely
+    # workers per process, no scratch (matches production), restart=no
+    # so Antithesis fault injection isn't fought by docker-compose.
+    #
+    # Sizing rationale lives in test/antithesis/workload/test/
+    # parallel_driver_parallel_workload.py — the driver maps invocation
+    # seed → pool slot deterministically and assumes the pool is at
+    # least as big as the expected concurrent-invocation count.
+    *[
+        Clusterd(
+            name=f"clusterd-pool-{i}",
+            workers=4,
+            scratch_directory=None,
+        )
+        for i in range(CLUSTERD_POOL_SIZE)
+    ],
     Materialized(
         external_blob_store=True,
         external_metadata_store=True,
@@ -166,6 +208,7 @@ def __init__(self) -> None:
 
 def workflow_default(c: Composition) -> None:
     """Bring up the Antithesis test cluster."""
+    pool_services = [f"clusterd-pool-{i}" for i in range(CLUSTERD_POOL_SIZE)]
     c.up(
         "postgres-metadata",
         "minio",
@@ -174,6 +217,7 @@ def workflow_default(c: Composition) -> None:
         "schema-registry",
         "clusterd1",
         "clusterd2",
+        *pool_services,
         "mysql",
         "mysql-replica",
     )

From 550f6f65d97f575c057aa2e7422452ef6cbfd2a0 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 18:29:37 -0400
Subject: [PATCH 44/65] test/antithesis: parallel-workload driver runs on
 per-invocation pool clusterd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires the antithesis parallel-workload driver to:

  * Claim one clusterd-pool-{i} container per invocation via a real
    allocator (fcntl.flock on /tmp/clusterd-pool-slots/{i}.lock; lock
    held for the lifetime of the invocation, released on normal return
    or exception). Slots are tried in randomized order so the slot a
    driver lands on doesn't correlate with the invocation seed. If
    every slot is held the driver tags a sometimes() and exits cleanly
    rather than running unisolated.

  * Construct ClusterdPoolMember(host='clusterd-pool-<slot>', workers=4)
    and pass to Database(pool_members=[member], seed_scoped_names=True).
    The initial cluster lands on its own clusterd container — Antithesis
    fault injection targets that container in isolation, which is the
    point of the whole change.

  * Scope setup-phase catalog sweeps to objects this invocation owns:
    'cluster-<seed>-%' and 'role-<seed>-%'. The previous 'c%' / 'r%'
    patterns would have torn down every concurrent invocation's
    still-running state. The shared connections (kafka_conn, csr_conn,
    aws_conn, minio) live outside any seed-scoped database; we never
    drop them (CREATE ... IF NOT EXISTS is idempotent and dropping
    would CASCADE through another invocation's in-flight sources).

  * Drop seed-scoped clusters / databases / roles in the finally of
    _run_invocation() (the body split out of main()) so each invocation
    leaves the catalog clean and frees its pool slot's clusterd. The
    DROP CLUSTER on an unmanaged cluster re-arms the clusterd to accept
    a fresh controller connection via the same reconcile() path that
    handles environmentd restarts (storage_state::reconcile drops stale
    objects, transport::serve cancels the prior connection on the next
    connect).

Pool size is read from CLUSTERD_POOL_SIZE (env), matching the
ANTITHESIS_CLUSTERD_POOL_SIZE knob in test/antithesis/mzcompose.py.
Default 8.

v1 scope (documented for the next round of work):
  * MAX_INITIAL_CLUSTERS = 1 per invocation, REPLICATION FACTOR = 1.
    Multi-replica coverage stays in antithesis_cluster.
  * CreateClusterAction / CreateClusterReplicaAction /
    DropClusterReplicaAction are skipped in pool mode; no in-band
    allocator inside the framework yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test/parallel_driver_parallel_workload.py | 311 +++++++++++++++---
 1 file changed, 259 insertions(+), 52 deletions(-)

diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index ec3b91e9b38c6..7d370f2048222 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -38,6 +38,8 @@
 
 from __future__ import annotations
 
+import contextlib
+import fcntl
 import logging
 import os
 import random
@@ -78,6 +80,7 @@
     MAX_TABLES,
     MAX_VIEWS,
     MAX_WEBHOOK_SOURCES,
+    ClusterdPoolMember,
     Database,
 )
 from materialize.parallel_workload.executor import Executor
@@ -105,6 +108,30 @@
 RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20"))
 NUM_THREADS = int(os.environ.get("PW_THREADS", "4"))
 
+# Number of clusterd-pool-{i} containers reserved for the parallel-workload
+# driver. Must match the pool actually deployed in
+# test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there →
+# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims one
+# slot via `fcntl.flock` (see `_claim_pool_slot`); the lock is held for
+# the lifetime of the invocation so concurrent driver processes inside
+# the workload container can't pick the same clusterd.
+CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8"))
+
+# Workers configured per clusterd-pool-{i} process. Must match the
+# `Clusterd(..., workers=...)` argument in test/antithesis/mzcompose.py
+# or the unmanaged CREATE CLUSTER REPLICA's `WORKERS` count will diverge
+# from what clusterd actually runs.
+CLUSTERD_POOL_WORKERS = 4
+
+# Filesystem locks let concurrent parallel-workload invocations claim
+# distinct clusterd-pool members without coordinating through the SUT.
+# All invocations exec inside the single `workload` container so a
+# regular flock on a tmpfs path is sufficient (no cross-container
+# coordination required).
+POOL_SLOT_LOCK_DIR = os.environ.get(
+    "CLUSTERD_POOL_SLOT_LOCK_DIR", "/tmp/clusterd-pool-slots"
+)
+
 
 def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None:
     try:
@@ -223,37 +250,152 @@ def _tolerate_setup_race(fn, *args, **kwargs):
         raise
 
 
+@contextlib.contextmanager
+def _claim_pool_slot(rng: random.Random):
+    """Hold an exclusive `fcntl.flock` on a pool-slot lockfile for the
+    duration of the `with` block. Yields the slot index, or `None` if every
+    slot is busy.
+
+    Slots are tried in a randomized order so the slot a driver lands on
+    doesn't correlate with deterministic state (test composer seed, wall
+    clock). The lock is released when the context exits — either normally
+    or via exception — so a crashing driver doesn't strand the slot.
+
+    All parallel-workload driver invocations share the workload container's
+    filesystem, so a plain flock on a tmpfs path under `POOL_SLOT_LOCK_DIR`
+    is sufficient to serialize claims. If the path can't be created we fall
+    back to yielding `None` (caller must handle: the existing setup-tolerance
+    path can absorb a slot collision, it just costs us pool isolation for
+    that one invocation).
+    """
+    try:
+        os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True)
+    except OSError as exc:
+        LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc)
+        yield None
+        return
+
+    slots = list(range(CLUSTERD_POOL_SIZE))
+    rng.shuffle(slots)
+    for slot in slots:
+        path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock")
+        fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600)
+        try:
+            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except OSError:
+            # Another invocation owns this slot; try the next one.
+            os.close(fd)
+            continue
+        try:
+            yield slot
+        finally:
+            try:
+                fcntl.flock(fd, fcntl.LOCK_UN)
+            finally:
+                os.close(fd)
+        return
+    LOG.warning("all %d pool slots are claimed; running without isolation", CLUSTERD_POOL_SIZE)
+    yield None
+
+
+def _drop_seed_scoped_objects(seed: str) -> None:
+    """Drop everything this invocation's seed owns: its clusters, roles, and
+    databases. Called from `main()`'s finally so each invocation leaves the
+    catalog clean and frees its pool-slot's clusterd to be claimed by the
+    next driver run (DROP CLUSTER tears down the unmanaged replica → the
+    clusterd's existing controller connection ends → the next CREATE
+    CLUSTER pointed at the same address claims it via fresh reconcile).
+
+    Errors here are logged and swallowed: leftover objects only cost a bit
+    of catalog footprint until the next invocation's setup sweep picks them
+    up. Don't let a cleanup failure turn into an assertion failure.
+    """
+    from pg8000.native import identifier
+
+    try:
+        with (
+            psycopg.connect(
+                host=PGHOST,
+                port=PGPORT,
+                user=PGUSER,
+                dbname=PGDATABASE,
+                autocommit=True,
+                connect_timeout=15,
+            ) as conn,
+            conn.cursor() as cur,
+        ):
+            # `seed` is u64-derived; safe to splice. We can't use psycopg's
+            # parameter binding for `LIKE` patterns here without forcing the
+            # caller to think about driver-specific placeholder syntax —
+            # inline f-strings match the rest of this module.
+            def _drop(sql: str) -> None:
+                try:
+                    cur.execute(sql.encode())
+                except Exception as exc:  # noqa: BLE001
+                    LOG.debug("cleanup tolerated: %s — %s", sql, exc)
+
+            cur.execute(
+                f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'".encode()
+            )
+            for row in cur.fetchall():
+                _drop(f"DROP CLUSTER {identifier(row[0])} CASCADE")
+
+            cur.execute(
+                f"SELECT name FROM mz_databases WHERE name LIKE 'db-pw-{seed}-%'".encode()
+            )
+            for row in cur.fetchall():
+                _drop(f"DROP DATABASE {identifier(row[0])} CASCADE")
+
+            cur.execute(
+                f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'".encode()
+            )
+            for row in cur.fetchall():
+                _drop(f"DROP ROLE {identifier(row[0])}")
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("cleanup connection failed: %s", exc)
+
+
 def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
     """Stand-in for `Database.create` that only sets up connections matching
     the Antithesis topology. Upstream's `create()` also wires polaris,
     sql-server, and an external postgres source — none of those are running
     in this compose.
 
-    Every statement is wrapped with `_tolerate_setup_race` because parallel
-    invocations of this driver race the same deterministic object names
-    (`role0..roleN`, `cluster-0..cluster-N`). Whoever loses the race for a
-    given object sees a known race message — already-exists, unknown-role,
-    unknown-cluster, or a transient DEPENDS-ON cleanup mismatch — and the
-    other invocation's outcome is fine for our purposes.
+    Catalog sweeps are scoped to objects this invocation owns: clusters
+    matching `cluster-{seed}-%` and roles matching `role-{seed}-%`. The
+    seed-scoped names are produced by `Database(seed_scoped_names=True)`;
+    cleaning anything broader would delete state belonging to other
+    concurrent invocations sharing the same SUT.
+
+    The shared connections / secret (`kafka_conn`, `csr_conn`, `aws_conn`,
+    `minio`) live outside any seed-scoped database and are required by every
+    invocation. We never drop them — `CREATE ... IF NOT EXISTS` is
+    idempotent and dropping would CASCADE through another invocation's
+    in-flight sources.
+
+    Setup-phase statements are wrapped with `_tolerate_setup_race` so a
+    losing race against another invocation creating the same shared object
+    (or against our own scoped leftovers being already absent) doesn't kill
+    the driver.
     """
     from pg8000.native import identifier
 
+    seed = database.seed
+
     for db in database.dbs:
         _tolerate_setup_race(db.drop, exe)
         _tolerate_setup_race(db.create, exe)
 
-    exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'")
+    # `seed` is the random_u64 the driver minted at the top of main(), so
+    # it's already safe to splice into SQL literally. `Executor.execute`
+    # takes a query string and doesn't support parameter binding.
+    exe.execute(f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'")
     for row in exe.cur.fetchall():
         _tolerate_setup_race(
             exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE"
         )
 
-    _tolerate_setup_race(exe.execute, "DROP SECRET IF EXISTS minio CASCADE")
-    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS aws_conn CASCADE")
-    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS kafka_conn CASCADE")
-    _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS csr_conn CASCADE")
-
-    exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'")
+    exe.execute(f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'")
     for row in exe.cur.fetchall():
         _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}")
 
@@ -340,9 +482,65 @@ def main() -> int:
 
     _prepare_system(NUM_THREADS)
 
+    # Claim one clusterd-pool-{i} container for this invocation. The flock
+    # is held until main() returns; concurrent invocations inside the
+    # workload container can't pick the same slot. If every slot is busy
+    # the context manager yields `None` — we tag that with a sometimes()
+    # for visibility and exit cleanly (the property surface for this
+    # invocation just doesn't get exercised).
+    #
+    # Each parallel-workload cluster lands on its own clusterd-pool-{slot}
+    # container, giving Antithesis per-cluster fault isolation. Without
+    # this, every parallel-workload cluster would be a child process of
+    # environmentd under the materialized container's process orchestrator,
+    # and the only container-level fault would be "the whole world".
+    with _claim_pool_slot(rng) as pool_slot:
+        sometimes(
+            pool_slot is not None,
+            "parallel workload: clusterd pool slot claimed",
+            {"pool_size": CLUSTERD_POOL_SIZE},
+        )
+        if pool_slot is None:
+            LOG.info(
+                "parallel-workload exiting cleanly: no pool slot available "
+                "(pool_size=%d)",
+                CLUSTERD_POOL_SIZE,
+            )
+            return 0
+        pool_member = ClusterdPoolMember(
+            host=f"clusterd-pool-{pool_slot}",
+            workers=CLUSTERD_POOL_WORKERS,
+        )
+        LOG.info(
+            "parallel-workload claimed pool slot %d (%s)",
+            pool_slot,
+            pool_member.host,
+        )
+        return _run_invocation(seed, rng, pool_member)
+
+
+def _run_invocation(
+    seed: str,
+    rng: random.Random,
+    pool_member: ClusterdPoolMember,
+) -> int:
+    """The bulk of `main()` once a pool slot has been claimed. Split out so
+    the slot lock stays held across this whole call: the lock is released
+    when the enclosing `with` block in `main()` exits.
+    """
+
     # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection
     # drops, which mirrors what Antithesis container-pauses look like at the
     # client. We never instantiate `KillAction` itself.
+    #
+    # `seed_scoped_names=True` keeps cluster/role names from colliding when
+    # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for
+    # the fallback when they collide anyway.
+    #
+    # `pool_members=[pool_member]` puts this invocation's single cluster
+    # on the pool member above; the framework forces managed=False and
+    # emits unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE
+    # ADDRESSES.
     database = Database(
         rng=rng,
         seed=seed,
@@ -357,54 +555,63 @@ def main() -> int:
         complexity=Complexity.DDL,
         scenario=Scenario.Kill,
         naughty_identifiers=False,
+        seed_scoped_names=True,
+        pool_members=[pool_member],
     )
 
     end_time = time.time() + RUNTIME_S
 
     setup_failure: Exception | None = None
-    try:
-        with (
-            psycopg.connect(
-                host=PGHOST,
-                port=PGPORT,
-                user=PGUSER,
-                dbname=PGDATABASE,
-                autocommit=True,
-                connect_timeout=15,
-            ) as setup_conn,
-            setup_conn.cursor() as setup_cur,
-        ):
-            setup_exe = Executor(rng, setup_cur, None, database)
-            _create_database_for_antithesis(database, setup_exe)
-    except Exception as exc:  # noqa: BLE001
-        setup_failure = exc
-        LOG.exception("parallel-workload setup failed")
-
     workers: list[Worker] = []
     threads: list[threading.Thread] = []
     worker_failed: WorkerFailedException | None = None
-    if setup_failure is None:
-        workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS)
+    try:
         try:
-            while time.time() < end_time:
-                dead = [t for t in threads if not t.is_alive()]
-                if dead:
-                    occurred = next(
-                        (w.occurred_exception for w in workers if w.occurred_exception),
-                        None,
-                    )
-                    worker_failed = WorkerFailedException(
-                        f"thread {dead[0].name} exited early", occurred
-                    )
-                    for worker in workers:
-                        worker.end_time = time.time()
-                    break
-                time.sleep(0.5)
-        finally:
-            for worker in workers:
-                worker.end_time = time.time()
-            for thread in threads:
-                thread.join(timeout=30)
+            with (
+                psycopg.connect(
+                    host=PGHOST,
+                    port=PGPORT,
+                    user=PGUSER,
+                    dbname=PGDATABASE,
+                    autocommit=True,
+                    connect_timeout=15,
+                ) as setup_conn,
+                setup_conn.cursor() as setup_cur,
+            ):
+                setup_exe = Executor(rng, setup_cur, None, database)
+                _create_database_for_antithesis(database, setup_exe)
+        except Exception as exc:  # noqa: BLE001
+            setup_failure = exc
+            LOG.exception("parallel-workload setup failed")
+
+        if setup_failure is None:
+            workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS)
+            try:
+                while time.time() < end_time:
+                    dead = [t for t in threads if not t.is_alive()]
+                    if dead:
+                        occurred = next(
+                            (w.occurred_exception for w in workers if w.occurred_exception),
+                            None,
+                        )
+                        worker_failed = WorkerFailedException(
+                            f"thread {dead[0].name} exited early", occurred
+                        )
+                        for worker in workers:
+                            worker.end_time = time.time()
+                        break
+                    time.sleep(0.5)
+            finally:
+                for worker in workers:
+                    worker.end_time = time.time()
+                for thread in threads:
+                    thread.join(timeout=30)
+    finally:
+        # Always free this invocation's seed-scoped state, including its
+        # pool-slot cluster, so the next driver invocation can claim the
+        # slot cleanly. Wrapped in try/except inside the helper; any
+        # cleanup failure is logged but never escapes.
+        _drop_seed_scoped_objects(seed)
 
     total_queries = sum(w.num_queries.total() for w in workers)
     total_ignored = sum(

From 008830b0b5faae7c9b6fe0eb00bd6f39f3c2a01b Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 19:00:40 -0400
Subject: [PATCH 45/65] test/antithesis/scratchbook: per-cluster fault
 isolation for parallel-workload

Documents the pool-backed parallel-workload topology: pool of clusterd
containers, file-lock slot allocator, seed-scoped naming, drop-on-exit,
and the reconcile-path correctness argument for clusterd reuse across
DROP/CREATE CLUSTER cycles. Lists current failure modes (all-slots-held,
crash-before-cleanup, sizing) and v1 limitations the next round of work
will close.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../parallel-workload-fault-isolation.md      | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 test/antithesis/scratchbook/parallel-workload-fault-isolation.md

diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
new file mode 100644
index 0000000000000..8cbe235b8183b
--- /dev/null
+++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
@@ -0,0 +1,163 @@
+# Per-cluster fault isolation for parallel-workload under Antithesis
+
+## Problem
+
+Antithesis fault injection operates at the docker-compose container
+boundary: it kills, pauses, partitions, throttles individual containers.
+For per-cluster fault coverage to be observable, each cluster the SUT
+allocates needs to live in its own container — otherwise "fault one
+cluster" reduces to "fault every cluster sharing this container".
+
+The Antithesis compose has one `materialized` container running
+environmentd. By default, every cluster a workload provisions becomes a
+clusterd child process under that environmentd's process orchestrator.
+Antithesis cannot fault a single child process; the smallest fault unit
+is the whole `materialized` container, which is "the entire SUT".
+
+The `antithesis_cluster` (the always-on user cluster the long-running
+workloads target) is already an unmanaged cluster pointed at two external
+clusterd containers (`clusterd1`, `clusterd2`), one per replica. That
+gives us per-replica fault coverage for that cluster.
+
+The gap is `parallel-workload` clusters. The randomized stress driver
+creates new clusters as part of its action surface. Without external
+clusterds, every parallel-workload cluster collapses back onto
+environmentd's process orchestrator and the fault domain disappears.
+
+## Solution
+
+A pool of identical pre-deployed clusterd containers
+(`clusterd-pool-{0..N-1}`), one container per parallel-workload
+invocation. Each invocation claims one slot via filesystem locking,
+provisions its sole cluster as an unmanaged replica pointed at that
+slot's clusterd, and releases the slot on exit.
+
+Components, bottom-up:
+
+  - **`Clusterd(name="clusterd-pool-{i}", workers=4, scratch_directory=None)`**
+    in `test/antithesis/mzcompose.py`. Same configuration as
+    `clusterd1`/`clusterd2`: four timely workers per process (so
+    Antithesis thread-pause faults have something distinct to pause),
+    mem_env RocksDB (matches production, no scratch volume to fight over).
+    Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8).
+
+  - **`parallel_workload.Database(pool_members=...,
+    seed_scoped_names=True)`**. Opt-in framework mode: when
+    `pool_members` is set, the framework provisions unmanaged clusters
+    with explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES
+    instead of managed SIZE/REPLICATION FACTOR; the CreateCluster /
+    CreateReplica / DropReplica actions skip pool-backed clusters
+    because there is no in-band allocator. `seed_scoped_names=True`
+    renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` /
+    `role-{seed}-{N}` so concurrent invocations don't collide on
+    global names.
+
+  - **`_claim_pool_slot()`** in
+    `test/antithesis/workload/test/parallel_driver_parallel_workload.py`.
+    Contextmanager that holds `fcntl.flock(LOCK_EX | LOCK_NB)` on
+    `/tmp/clusterd-pool-slots/{i}.lock` for the lifetime of the
+    invocation. Slots tried in randomized order so allocation is
+    decorrelated from invocation seed. The lock is released on context
+    exit (normal or exception), so a crashing driver doesn't strand the
+    slot.
+
+  - **`_drop_seed_scoped_objects()`** in the same driver, called in
+    `main()`'s `finally`. Drops every cluster / database / role whose
+    name starts with `cluster-{seed}-` / `db-pw-{seed}-` /
+    `role-{seed}-`. The DROP CLUSTER re-arms the clusterd to be
+    claimed by the next invocation through the reconcile path
+    (see below).
+
+## Clusterd reuse correctness
+
+The pool design assumes a DROP CLUSTER followed by a CREATE CLUSTER
+pointed at the same clusterd is a supported transition. It is — this is
+the same reconciliation path that handles environmentd restart. The
+three pieces:
+
+  1. **Transport cancels the prior connection on every new connect.**
+     `src/service/src/transport.rs::serve` drops the old
+     connection-task token and awaits the task before installing a
+     fresh handler from `handler_fn()`. The new `ClusterClient` is a
+     blank-slate wrapper around the same `Arc<Mutex<TimelyContainer>>`.
+
+  2. **The worker `run` loop survives client disconnects.**
+     `src/storage/src/storage_state.rs::Worker::run` is
+     `while let Some((nonce, rx, tx)) = client_rx.blocking_recv() {
+     run_client(rx, tx); }`. When the old `cmd_tx` is dropped (because
+     the cancel above tore down the prior client), `run_client` returns
+     and the outer loop awaits the next `(nonce, rx, tx)` — the new
+     controller's connection. Worker in-memory state stays resident
+     between connections.
+
+  3. **`reconcile()` drops stale state.** The new controller's first
+     batch of commands ending in `InitializationComplete` is processed
+     by `storage_state::reconcile`: it computes `expected_objects` from
+     the new commands, identifies `stale_objects` as anything the
+     worker knows about that the new controller did not ask for, and
+     `drop_collection`s each one — releasing source tokens (which tears
+     down Kafka consumers, persist write handles, upsert RocksDB state),
+     dropping dataflows, clearing reported frontiers.
+
+Collection IDs do not collide across cluster lifetimes because
+Materialize allocates them globally (`u<n>`, `t<n>`), not per cluster.
+
+The one piece intentionally shared across reconnects is the
+`Arc<PersistClientCache>`. It is keyed by URL+credentials, not by
+cluster identity, and reusing it is the standard production behavior
+(avoids reauthenticating to S3 / postgres-metadata on every reconnect).
+
+The same analysis holds for the compute side (`src/compute/src/server.rs`
+uses the same `ClusterSpec` pattern).
+
+## Failure modes
+
+  - **All pool slots held.** Driver tags `sometimes(...)` for
+    visibility and exits cleanly. With the default pool size (8) and
+    the test composer's normal concurrency this is not expected to
+    fire, but if it does we'll see it in the run report.
+
+  - **Crash before drop-on-exit runs.** The flock is released
+    automatically when the process dies (kernel-level lock release).
+    The clusterd is left holding stale state until the next claimant
+    reconciles. Catalog leftovers (`cluster-{seed}-*`,
+    `role-{seed}-*`, `db-pw-{seed}-*`) accumulate until the next
+    invocation with the same seed runs its setup sweep — extremely
+    unlikely since seeds are u64-random. The setup sweep is scoped
+    to the current seed only, so it does not clean cross-invocation
+    leftovers. A periodic external cleanup or a startup-time scan
+    against `mz_clusters` / `mz_roles` / `mz_databases` would be
+    needed to close this loop properly. For now the catalog growth
+    is bounded by run length and not currently a problem.
+
+  - **Pool sizing wrong vs concurrency.** If concurrency exceeds pool
+    size, the late arrivals get "no slot" and exit. We do not currently
+    auto-tune; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if telemetry shows
+    the "no slot available" signal firing.
+
+## v1 limitations (future work)
+
+  - **REPLICATION FACTOR 1, no multi-replica parallel-workload coverage.**
+    The pool gives each invocation one container; multi-replica
+    coverage for compute/storage paths remains in `antithesis_cluster`.
+
+  - **No in-band allocator inside the framework.** Worker threads
+    cannot grab additional pool members mid-run, so
+    `CreateClusterAction` / `CreateClusterReplicaAction` /
+    `DropClusterReplicaAction` are skipped when pool-backed. The
+    framework only ever touches the pre-allocated pool members.
+
+  - **No global GC of cross-invocation catalog leftovers.** See
+    failure modes above. A first-invocation sweep against
+    `mz_clusters WHERE name LIKE 'cluster-%-%'` minus the current
+    seed would close this; deferred until it becomes a problem.
+
+## Tunables
+
+| Variable | Default | Effect |
+|---|---|---|
+| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. |
+| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. |
+| `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. |
+| `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. |
+| `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. |

From 84bdebe000191e38e953108067bde19d5f997216 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 19:04:17 -0400
Subject: [PATCH 46/65] parallel_workload: pool mode provisions one cluster
 with N replicas, not N single-replica clusters

The driver now makes a best-effort claim of up to PW_DESIRED_REPLICAS
(default 2) clusterd-pool slots per invocation; the framework consumes
the whole pool_members list into a single unmanaged cluster with one
replica per member instead of one cluster per member. This gives
parallel-workload multi-replica fault coverage (previously only
antithesis_cluster ran multi-replica) when pool capacity allows, and
degrades gracefully to a single-replica cluster under contention.

The driver helper is renamed from _claim_pool_slot (yields one slot or
None) to _claim_pool_slots (yields a list of 0..desired slots) so the
contention fallback is just a shorter list rather than a special case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../materialize/parallel_workload/database.py |  19 ++-
 .../parallel-workload-fault-isolation.md      |  36 ++---
 .../test/parallel_driver_parallel_workload.py | 151 ++++++++++--------
 3 files changed, 109 insertions(+), 97 deletions(-)

diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py
index 4eecbdf2ea2f3..be639f9145f82 100644
--- a/misc/python/materialize/parallel_workload/database.py
+++ b/misc/python/materialize/parallel_workload/database.py
@@ -1205,26 +1205,25 @@ def __init__(
         ]
         self.role_id = len(self.roles)
         # At least one storage cluster required for WebhookSources.
-        # In pool mode, each cluster claims one pool member from a
-        # deterministic slice; the number of clusters is the slice size, no
-        # rng.randint. Caller is responsible for sizing `pool_members` to
-        # the desired cluster count.
+        # In pool mode, the entire `pool_members` list is consumed by a
+        # single unmanaged cluster — one replica per member — so the
+        # caller controls both replica count and pool-member identity.
+        # This is the only initial cluster; CreateClusterAction is
+        # disabled in pool mode (no in-band allocator).
         if pool_members is not None:
-            initial_cluster_count = len(pool_members)
             self.clusters = [
                 Cluster(
-                    i,
+                    0,
                     # managed/size are ignored when pool-backed but kept as
                     # placeholder values for any code that reads them
                     # without consulting `is_pool_backed`.
                     managed=False,
-                    size=pool_members[i].host,
-                    replication_factor=1,
+                    size=pool_members[0].host,
+                    replication_factor=len(pool_members),
                     introspection_interval="1s",
                     name_scope=self.name_scope,
-                    pool_members=[pool_members[i]],
+                    pool_members=pool_members,
                 )
-                for i in range(initial_cluster_count)
             ]
         else:
             self.clusters = [
diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
index 8cbe235b8183b..f9e182c043ab5 100644
--- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
+++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
@@ -27,10 +27,12 @@ environmentd's process orchestrator and the fault domain disappears.
 ## Solution
 
 A pool of identical pre-deployed clusterd containers
-(`clusterd-pool-{0..N-1}`), one container per parallel-workload
-invocation. Each invocation claims one slot via filesystem locking,
-provisions its sole cluster as an unmanaged replica pointed at that
-slot's clusterd, and releases the slot on exit.
+(`clusterd-pool-{0..N-1}`). Each invocation claims up to
+`PW_DESIRED_REPLICAS` (default 2) slots via filesystem locking and
+provisions a single unmanaged cluster with one replica per claimed
+slot, then releases the locks on exit. Best-effort: with N slots
+claimed the cluster runs as an N-replica cluster (1 ≤ N ≤ desired);
+no slots → exit cleanly.
 
 Components, bottom-up:
 
@@ -43,23 +45,24 @@ Components, bottom-up:
 
   - **`parallel_workload.Database(pool_members=...,
     seed_scoped_names=True)`**. Opt-in framework mode: when
-    `pool_members` is set, the framework provisions unmanaged clusters
-    with explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES
-    instead of managed SIZE/REPLICATION FACTOR; the CreateCluster /
+    `pool_members` is set, the framework provisions one unmanaged
+    cluster with `len(pool_members)` replicas, each pointed at a pool
+    member via explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES
+    (in place of managed SIZE/REPLICATION FACTOR); the CreateCluster /
     CreateReplica / DropReplica actions skip pool-backed clusters
     because there is no in-band allocator. `seed_scoped_names=True`
     renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` /
     `role-{seed}-{N}` so concurrent invocations don't collide on
     global names.
 
-  - **`_claim_pool_slot()`** in
+  - **`_claim_pool_slots()`** in
     `test/antithesis/workload/test/parallel_driver_parallel_workload.py`.
-    Contextmanager that holds `fcntl.flock(LOCK_EX | LOCK_NB)` on
-    `/tmp/clusterd-pool-slots/{i}.lock` for the lifetime of the
-    invocation. Slots tried in randomized order so allocation is
-    decorrelated from invocation seed. The lock is released on context
-    exit (normal or exception), so a crashing driver doesn't strand the
-    slot.
+    Contextmanager that holds up to `PW_DESIRED_REPLICAS` exclusive
+    `fcntl.flock`s on `/tmp/clusterd-pool-slots/{i}.lock` for the
+    lifetime of the invocation. Slots are tried in randomized order so
+    allocation is decorrelated from invocation seed. Every claimed lock
+    is released on context exit (normal or exception), so a crashing
+    driver doesn't strand any slot.
 
   - **`_drop_seed_scoped_objects()`** in the same driver, called in
     `main()`'s `finally`. Drops every cluster / database / role whose
@@ -137,10 +140,6 @@ uses the same `ClusterSpec` pattern).
 
 ## v1 limitations (future work)
 
-  - **REPLICATION FACTOR 1, no multi-replica parallel-workload coverage.**
-    The pool gives each invocation one container; multi-replica
-    coverage for compute/storage paths remains in `antithesis_cluster`.
-
   - **No in-band allocator inside the framework.** Worker threads
     cannot grab additional pool members mid-run, so
     `CreateClusterAction` / `CreateClusterReplicaAction` /
@@ -159,5 +158,6 @@ uses the same `ClusterSpec` pattern).
 | `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. |
 | `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. |
 | `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. |
+| `PW_DESIRED_REPLICAS` (driver) | 2 | Replicas to ask for per invocation's cluster. Best-effort: driver claims up to this many slots and runs with whatever it gets (≥1). |
 | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. |
 | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. |
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 7d370f2048222..5929f88327590 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -111,10 +111,10 @@
 # Number of clusterd-pool-{i} containers reserved for the parallel-workload
 # driver. Must match the pool actually deployed in
 # test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there →
-# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims one
-# slot via `fcntl.flock` (see `_claim_pool_slot`); the lock is held for
-# the lifetime of the invocation so concurrent driver processes inside
-# the workload container can't pick the same clusterd.
+# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims
+# slots via `fcntl.flock` (see `_claim_pool_slots`); the locks are held
+# for the lifetime of the invocation so concurrent driver processes
+# inside the workload container can't pick the same clusterd.
 CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8"))
 
 # Workers configured per clusterd-pool-{i} process. Must match the
@@ -123,6 +123,14 @@
 # from what clusterd actually runs.
 CLUSTERD_POOL_WORKERS = 4
 
+# Replicas to ask for per invocation's cluster. Best-effort: the driver
+# claims up to this many pool slots and runs whatever it gets (≥1). With
+# DESIRED_REPLICAS=2 and POOL_SIZE=8 we get multi-replica coverage for
+# the parallel-workload cluster (currently only `antithesis_cluster` is
+# multi-replica) when capacity allows, while degrading gracefully to a
+# single-replica cluster under contention.
+DESIRED_REPLICAS = int(os.environ.get("PW_DESIRED_REPLICAS", "2"))
+
 # Filesystem locks let concurrent parallel-workload invocations claim
 # distinct clusterd-pool members without coordinating through the SUT.
 # All invocations exec inside the single `workload` container so a
@@ -251,51 +259,52 @@ def _tolerate_setup_race(fn, *args, **kwargs):
 
 
 @contextlib.contextmanager
-def _claim_pool_slot(rng: random.Random):
-    """Hold an exclusive `fcntl.flock` on a pool-slot lockfile for the
-    duration of the `with` block. Yields the slot index, or `None` if every
-    slot is busy.
-
-    Slots are tried in a randomized order so the slot a driver lands on
-    doesn't correlate with deterministic state (test composer seed, wall
-    clock). The lock is released when the context exits — either normally
-    or via exception — so a crashing driver doesn't strand the slot.
-
-    All parallel-workload driver invocations share the workload container's
-    filesystem, so a plain flock on a tmpfs path under `POOL_SLOT_LOCK_DIR`
-    is sufficient to serialize claims. If the path can't be created we fall
-    back to yielding `None` (caller must handle: the existing setup-tolerance
-    path can absorb a slot collision, it just costs us pool isolation for
-    that one invocation).
+def _claim_pool_slots(rng: random.Random, desired: int):
+    """Hold exclusive `fcntl.flock`s on up to `desired` pool-slot lockfiles
+    for the duration of the `with` block. Yields the list of claimed slot
+    indices (length 0–`desired`); the caller decides what to do with each
+    population (1 = single-replica fallback, ≥2 = multi-replica cluster,
+    0 = no slots available, exit cleanly).
+
+    Slots are tried in randomized order so allocation is decorrelated
+    from invocation seed / wall clock. Every claimed flock is released
+    when the context exits — normally or via exception — so a crashing
+    driver doesn't strand any slot.
+
+    All parallel-workload driver invocations share the workload
+    container's filesystem, so plain flock on a tmpfs path under
+    `POOL_SLOT_LOCK_DIR` is sufficient serialization (no cross-container
+    coordination required).
     """
     try:
         os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True)
     except OSError as exc:
         LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc)
-        yield None
+        yield []
         return
 
     slots = list(range(CLUSTERD_POOL_SIZE))
     rng.shuffle(slots)
-    for slot in slots:
-        path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock")
-        fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600)
-        try:
-            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
-        except OSError:
-            # Another invocation owns this slot; try the next one.
-            os.close(fd)
-            continue
-        try:
-            yield slot
-        finally:
+    held: list[tuple[int, int]] = []  # (slot, fd)
+    try:
+        for slot in slots:
+            if len(held) >= desired:
+                break
+            path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock")
+            fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600)
+            try:
+                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except OSError:
+                os.close(fd)
+                continue
+            held.append((slot, fd))
+        yield [slot for slot, _ in held]
+    finally:
+        for _, fd in held:
             try:
                 fcntl.flock(fd, fcntl.LOCK_UN)
             finally:
                 os.close(fd)
-        return
-    LOG.warning("all %d pool slots are claimed; running without isolation", CLUSTERD_POOL_SIZE)
-    yield None
 
 
 def _drop_seed_scoped_objects(seed: str) -> None:
@@ -482,50 +491,54 @@ def main() -> int:
 
     _prepare_system(NUM_THREADS)
 
-    # Claim one clusterd-pool-{i} container for this invocation. The flock
-    # is held until main() returns; concurrent invocations inside the
-    # workload container can't pick the same slot. If every slot is busy
-    # the context manager yields `None` — we tag that with a sometimes()
-    # for visibility and exit cleanly (the property surface for this
-    # invocation just doesn't get exercised).
+    # Claim up to DESIRED_REPLICAS pool slots; the cluster runs with as
+    # many replicas as we got (≥1). Locks are held until main() returns;
+    # if no slot is free we tag a sometimes() and exit cleanly.
     #
-    # Each parallel-workload cluster lands on its own clusterd-pool-{slot}
-    # container, giving Antithesis per-cluster fault isolation. Without
-    # this, every parallel-workload cluster would be a child process of
-    # environmentd under the materialized container's process orchestrator,
-    # and the only container-level fault would be "the whole world".
-    with _claim_pool_slot(rng) as pool_slot:
+    # Each replica lands on its own clusterd-pool-{slot} container, so
+    # Antithesis can fault one replica's container without taking the
+    # cluster offline — exercises the same multi-replica recovery paths
+    # `antithesis_cluster` covers, but on the workload-driven cluster.
+    with _claim_pool_slots(rng, DESIRED_REPLICAS) as pool_slots:
+        sometimes(
+            len(pool_slots) > 0,
+            "parallel workload: clusterd pool slots claimed",
+            {"pool_size": CLUSTERD_POOL_SIZE, "claimed": len(pool_slots)},
+        )
         sometimes(
-            pool_slot is not None,
-            "parallel workload: clusterd pool slot claimed",
-            {"pool_size": CLUSTERD_POOL_SIZE},
+            len(pool_slots) >= DESIRED_REPLICAS,
+            "parallel workload: full multi-replica pool claim",
+            {"pool_size": CLUSTERD_POOL_SIZE, "desired": DESIRED_REPLICAS},
         )
-        if pool_slot is None:
+        if not pool_slots:
             LOG.info(
-                "parallel-workload exiting cleanly: no pool slot available "
+                "parallel-workload exiting cleanly: no pool slots available "
                 "(pool_size=%d)",
                 CLUSTERD_POOL_SIZE,
             )
             return 0
-        pool_member = ClusterdPoolMember(
-            host=f"clusterd-pool-{pool_slot}",
-            workers=CLUSTERD_POOL_WORKERS,
-        )
+        pool_members = [
+            ClusterdPoolMember(
+                host=f"clusterd-pool-{slot}",
+                workers=CLUSTERD_POOL_WORKERS,
+            )
+            for slot in pool_slots
+        ]
         LOG.info(
-            "parallel-workload claimed pool slot %d (%s)",
-            pool_slot,
-            pool_member.host,
+            "parallel-workload claimed %d pool slot(s): %s",
+            len(pool_slots),
+            ", ".join(m.host for m in pool_members),
         )
-        return _run_invocation(seed, rng, pool_member)
+        return _run_invocation(seed, rng, pool_members)
 
 
 def _run_invocation(
     seed: str,
     rng: random.Random,
-    pool_member: ClusterdPoolMember,
+    pool_members: list[ClusterdPoolMember],
 ) -> int:
-    """The bulk of `main()` once a pool slot has been claimed. Split out so
-    the slot lock stays held across this whole call: the lock is released
+    """The bulk of `main()` once pool slot(s) have been claimed. Split out
+    so the slot locks stay held across this whole call: they are released
     when the enclosing `with` block in `main()` exits.
     """
 
@@ -537,10 +550,10 @@ def _run_invocation(
     # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for
     # the fallback when they collide anyway.
     #
-    # `pool_members=[pool_member]` puts this invocation's single cluster
-    # on the pool member above; the framework forces managed=False and
-    # emits unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE
-    # ADDRESSES.
+    # `pool_members=pool_members` makes a single unmanaged cluster with one
+    # replica per member; the framework forces managed=False and emits
+    # unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE ADDRESSES
+    # for each replica.
     database = Database(
         rng=rng,
         seed=seed,
@@ -556,7 +569,7 @@ def _run_invocation(
         scenario=Scenario.Kill,
         naughty_identifiers=False,
         seed_scoped_names=True,
-        pool_members=[pool_member],
+        pool_members=pool_members,
     )
 
     end_time = time.time() + RUNTIME_S

From bb766ee19283c8cf56ff529a22cafbeca7434985 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Wed, 13 May 2026 19:42:23 -0400
Subject: [PATCH 47/65] test/antithesis: tolerate Antithesis fault-injection
 errors in parallel-workload setup and worker phases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fault-injection shapes were escaping the parallel-workload driver as
'unexpected' SQL errors and tripping the always() assertion:

  1. SETUP: "Failed to resolve hostname" during CREATE CONNECTION FOR
     KAFKA — materialized's broker-validation can't reach `kafka`
     because Antithesis paused the kafka container or partitioned DNS.
  2. SETUP: "connection timeout expired" / "server closed the
     connection unexpectedly" from the driver's psycopg.connect to
     `materialized:6875` — Antithesis paused the materialized container
     during setup.
  3. WORKER: "thread pw-worker-N exited early" with no captured cause
     — `Worker.run`'s initial psycopg.connect / websocket / SET
     statements run outside any try/except, so a fault that lands during
     worker startup kills the thread without populating
     `occurred_exception`. The driver's worker_failed payload then
     reports just the thread-exit-early string with no underlying cause.

None of these are SUT correctness issues — they're the expected cost of
fault injection landing at inconvenient moments.

The fix adds a second tolerance category, _SETUP_FAULT_PATTERNS, alongside
the existing _SETUP_RACE_PATTERNS, and:

  * Inside _tolerate_setup_race: swallow per-statement exceptions whose
    message matches either pattern set.
  * Around the whole setup phase: if setup_failure matches, demote out of
    the always() assertion and into a sometimes() for visibility.
  * Around worker thread death: if occurred_exception is None
    (fault-killed startup) or its message matches a tolerated pattern
    (race or fault), demote out of the always() assertion and into a
    sometimes(). A worker that captures any other exception still
    fails the run as before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test/parallel_driver_parallel_workload.py | 120 ++++++++++++++++--
 1 file changed, 111 insertions(+), 9 deletions(-)

diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 5929f88327590..edbce1b622636 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -233,26 +233,96 @@ def _prepare_system(num_threads: int) -> None:
     "object state changed while transaction was in progress",
 )
 
+# Substring matches for setup-phase errors caused by Antithesis fault
+# injection rather than workload misuse. Antithesis can pause any container
+# (materialized, kafka, postgres-metadata, minio) at any time; if a pause
+# lands during the driver's setup phase we see one of these shapes. The
+# distinction matters because the always() assertion at the bottom of main()
+# treats setup failures as unexpected by default — a connection timeout to
+# materialized or a hostname-resolve failure for kafka is fault timing, not
+# a correctness bug to fail the run on.
+#
+#   "connection timeout"        — driver's psycopg.connect to materialized
+#                                 timed out (materialized container paused).
+#   "Multiple connection attempts failed"
+#                               — same shape, retry-exhaustion wording.
+#   "Failed to resolve hostname"— materialized's CREATE CONNECTION
+#                                 validation can't resolve `kafka` (kafka
+#                                 container paused or DNS path partitioned).
+#   "connection refused"        — target is up but socket closed; transient.
+#   "connection reset"          — TCP reset during a fault.
+#   "broken pipe"               — write to a socket the peer closed.
+#   "EOF detected"              — psycopg's wording for peer-closed during
+#                                 query.
+#   "server closed the connection unexpectedly"
+#                               — common psycopg flavour.
+_SETUP_FAULT_PATTERNS = (
+    "connection timeout",
+    "Multiple connection attempts failed",
+    "Failed to resolve hostname",
+    "connection refused",
+    "connection reset",
+    "broken pipe",
+    "EOF detected",
+    "server closed the connection unexpectedly",
+)
+
+
+def _matches_setup_tolerance(exc: BaseException) -> bool:
+    """True if `exc` is a setup-phase error we expect to see under either
+    concurrent-driver races or Antithesis fault injection. Used both inside
+    `_tolerate_setup_race` (to swallow per-statement) and around the whole
+    setup phase (to demote setup_failure from unexpected to a sometimes
+    signal).
+    """
+    msg = getattr(exc, "msg", None) or str(exc)
+    return any(
+        pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS)
+    )
+
+
+def _worker_death_tolerable(occurred: Exception | None) -> bool:
+    """True when an early-exiting worker thread is plausibly a fault-injection
+    casualty rather than a bug to fail the run on.
+
+    `parallel_workload.worker.Worker.run` performs its initial
+    `psycopg.connect` / websocket / `SET` statements outside any try/except,
+    so a fault that lands during worker startup kills the thread with
+    `occurred_exception = None` (no captured cause). Once the worker is
+    inside its main action loop, captured `QueryError`s that don't match
+    `errors_to_ignore` populate `occurred_exception` — those are the ones
+    we want to look at. If the captured exception matches a fault shape
+    (`_SETUP_FAULT_PATTERNS`) it's still the fault that killed the worker,
+    not a SUT correctness bug.
+    """
+    if occurred is None:
+        return True
+    return _matches_setup_tolerance(occurred)
+
 
 def _tolerate_setup_race(fn, *args, **kwargs):
-    """Run `fn(...)`, swallowing the concurrent-race messages in
-    `_SETUP_RACE_PATTERNS` and propagating anything else.
+    """Run `fn(...)`, swallowing the messages in `_SETUP_RACE_PATTERNS` or
+    `_SETUP_FAULT_PATTERNS` and propagating anything else.
 
     The setup phase is invoked by every parallel-driver invocation, and the
     framework picks deterministic object names from a small pool. Concurrent
-    invocations therefore race to drop-then-create the same names; any single
-    race outcome is fine because the per-invocation Database object only
-    needs its named objects to exist by the time worker threads start.
+    invocations therefore race to drop-then-create the same names; any
+    single race outcome is fine because the per-invocation Database object
+    only needs its named objects to exist by the time worker threads start.
+
+    Fault-induced errors (container paused, DNS partitioned, socket reset)
+    are absorbed for the same reason: they're expected under Antithesis,
+    not workload bugs.
     """
     try:
         return fn(*args, **kwargs)
     except QueryError as exc:
-        if any(pat in (exc.msg or "") for pat in _SETUP_RACE_PATTERNS):
+        if _matches_setup_tolerance(exc):
             LOG.debug("setup tolerated: %s — %s", exc.query, exc.msg)
             return None
         raise
     except Exception as exc:  # noqa: BLE001
-        if any(pat in str(exc) for pat in _SETUP_RACE_PATTERNS):
+        if _matches_setup_tolerance(exc):
             LOG.debug("setup tolerated: %s", exc)
             return None
         raise
@@ -649,10 +719,42 @@ def _run_invocation(
         {"ignored_errors": total_ignored},
     )
 
+    # Setup-phase failures whose message matches `_SETUP_*_PATTERNS` are
+    # either concurrent-driver races or Antithesis fault-injection
+    # consequences (paused container, partitioned DNS, reset socket).
+    # Neither is a SUT correctness issue, so demote them out of the
+    # `always(...)` assertion and into a `sometimes(...)` for visibility.
+    setup_tolerated = setup_failure is not None and _matches_setup_tolerance(
+        setup_failure
+    )
+    sometimes(
+        setup_tolerated,
+        "parallel workload: setup phase tolerated a fault-injection or race error",
+        {"error": str(setup_failure) if setup_failure else None},
+    )
+
+    # Worker-thread death under fault injection has the same
+    # "expected-not-a-bug" shape: an uncaptured-exception death (typically
+    # initial psycopg.connect failing because materialized was paused) or a
+    # captured exception whose message matches a fault pattern.
+    worker_tolerated = worker_failed is not None and _worker_death_tolerable(
+        worker_failed.cause
+    )
+    sometimes(
+        worker_tolerated,
+        "parallel workload: worker thread death tolerated as fault-injection consequence",
+        {
+            "error": (
+                str(worker_failed.cause) if worker_failed and worker_failed.cause else None
+            ),
+            "uncaptured": worker_failed is not None and worker_failed.cause is None,
+        },
+    )
+
     unexpected = None
-    if setup_failure is not None:
+    if setup_failure is not None and not setup_tolerated:
         unexpected = {"phase": "setup", "error": str(setup_failure)}
-    elif worker_failed is not None:
+    elif worker_failed is not None and not worker_tolerated:
         unexpected = {
             "phase": "worker",
             "error": (

From ff76a2767da2f76672d64258389ac1a019ae9199 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 10:31:56 -0400
Subject: [PATCH 48/65] test/antithesis: pivot pool design to permanent pool
 clusters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous design (CREATE+DROP CLUSTER per parallel-workload
invocation, targeting a pool clusterd) hit a hard clusterd halt on the
second invocation against the same slot:

  WARN: halting process: new instance configuration not compatible with
  existing instance configuration: ... index_logs:
  IntrospectionSourceIndex(...876897) vs Some(IntrospectionSourceIndex(...876641))

InstanceConfig::compatible_with compares LoggingConfig, which includes
the per-cluster IntrospectionSourceIndex GlobalIds — those are freshly
allocated by every CREATE CLUSTER. Pointing a different cluster
identity at a clusterd that already saw a prior cluster's introspection
indexes trips this check. reconcile() handles environmentd restarts
against the same cluster identity, but not cluster-identity changes.

Pivot to permanent pool clusters: one long-lived pool_cluster_<i>
per clusterd-pool-<i>, bootstrapped by the workload-entrypoint once
at compose-up and never dropped. Each parallel-workload invocation
picks a slot at random and runs against the pool cluster for that
slot. The cluster identity stays stable per slot, so the only
reconnect events the pool clusterds see are environmentd restarts and
Antithesis-injected pauses of the pool clusterd itself — the path
reconcile is designed for.

Concurrent invocations may share a pool cluster: every workload object
is in a seed-scoped database (db-pw-<seed>-*) with seed-scoped roles,
so DDL/DML never collides across invocations. No coordination
required — no fcntl.flock, no slot-claim contextmanager, no "no slot
available, exit cleanly" fallback. Antithesis still faults containers,
not invocations, so the per-container fault domain is preserved; two
invocations witnessing the same fault simply give us more independent
reproductions per failure.

Framework changes:
  * Cluster.pre_existing_name: when set, create()/drop() are no-ops,
    name() returns the literal, is_pool_backed flips True for action
    skips.
  * Database.existing_cluster_name: replaces pool_members on the
    Antithesis path; wraps one pre-existing cluster.
  * ClusterReplica loses its pool_member field (no longer used).
  * action.CreateClusterAction checks existing_cluster_name (was
    pool_members).

Driver changes:
  * Slot pick is rng.randrange(CLUSTERD_POOL_SIZE). No coordination.
  * _drop_seed_scoped_objects stops dropping clusters. The seed-scoped
    database drop cascades through every workload-created object on
    the cluster, returning the bound clusterd to an idle baseline.
  * Setup-phase pre-create sweep no longer touches mz_clusters.

mzcompose / entrypoint changes:
  * Workload service env now exports ANTITHESIS_CLUSTERD_POOL_SIZE +
    CLUSTERD_POOL_SIZE so the bootstrap script and the driver agree
    on the slot count.
  * workload-entrypoint.sh loops POOL_SIZE times and CREATEs each
    pool_cluster_<i> on its clusterd-pool-<i> (idempotent across
    compose-up).

Multi-replica parallel-workload coverage is gone in this iteration —
each pool cluster has one replica. Multi-replica coverage stays in
antithesis_cluster; a future revision could pair pool clusterds into
2-replica pool clusters at the cost of doubling the pool footprint.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../materialize/parallel_workload/action.py   |  22 +-
 .../materialize/parallel_workload/database.py | 139 +++++------
 test/antithesis/config/docker-compose.yaml    |   2 +
 test/antithesis/mzcompose.py                  |   6 +
 .../parallel-workload-fault-isolation.md      | 217 +++++++++--------
 .../test/parallel_driver_parallel_workload.py | 223 ++++++------------
 .../workload/workload-entrypoint.sh           |  40 ++++
 7 files changed, 297 insertions(+), 352 deletions(-)

diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py
index 89d9c0801fc76..60161d9d1fe74 100644
--- a/misc/python/materialize/parallel_workload/action.py
+++ b/misc/python/materialize/parallel_workload/action.py
@@ -2026,12 +2026,12 @@ def run(self, exe: Executor) -> bool:
 
 class CreateClusterAction(Action):
     def run(self, exe: Executor) -> bool:
-        # In pool mode the Database's clusters are wired to pre-existing
-        # clusterd containers from a finite pool the caller passed in.
-        # Dynamically creating a new cluster would need to claim an unused
-        # pool member, and we don't have an allocator. Skip — the initial
-        # clusters set up at construction time are the test surface.
-        if exe.db.pool_members is not None:
+        # In existing-cluster mode the Database wraps a pre-existing
+        # (caller-supplied) cluster, typically bootstrapped by the
+        # Antithesis compose, and we have no allocator for additional
+        # clusters tied to other pool members. Skip — the wrapped
+        # cluster is the entire test surface.
+        if exe.db.existing_cluster_name is not None:
             return False
         with exe.db.lock:
             if len(exe.db.clusters) >= MAX_CLUSTERS:
@@ -2178,9 +2178,8 @@ def run(self, exe: Executor) -> bool:
         with exe.db.lock:
             # Keep cluster 0 with 1 replica for sources/sinks
             unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed]
-            # Pool-backed clusters can't grow their replica count — there's
-            # no pool allocator handing out a fresh ClusterdPoolMember per
-            # ALTER CLUSTER ADD REPLICA. Skip them.
+            # Pre-existing (pool) clusters: the framework didn't create them
+            # and won't mutate them. Skip.
             unmanaged_clusters = [
                 c for c in unmanaged_clusters if not c.is_pool_backed
             ]
@@ -2207,10 +2206,7 @@ def run(self, exe: Executor) -> bool:
         with exe.db.lock:
             # Keep cluster 0 with 1 replica for sources/sinks
             unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed]
-            # Pool-backed clusters can't shrink either — without an
-            # allocator to release the pool member back, the in-memory
-            # model would diverge from materialize's catalog and later
-            # creates targeting the freed slot would conflict.
+            # Pre-existing (pool) clusters: same reasoning as above. Skip.
             unmanaged_clusters = [
                 c for c in unmanaged_clusters if not c.is_pool_backed
             ]
diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py
index be639f9145f82..7034be033dae8 100644
--- a/misc/python/materialize/parallel_workload/database.py
+++ b/misc/python/materialize/parallel_workload/database.py
@@ -915,16 +915,17 @@ def create(self, exe: Executor) -> None:
 
 @dataclasses.dataclass(frozen=True)
 class ClusterdPoolMember:
-    """One entry in an external clusterd pool that a `Cluster` can target as
-    an unmanaged replica.
+    """Address+config of one external clusterd container the SUT will host an
+    unmanaged cluster replica on.
 
-    Used by callers (Antithesis parallel-driver) that want fault-isolation
-    per cluster: each pool member is its own container, so Antithesis can
-    kill/pause/partition exactly one cluster's storage+compute without
-    taking down the other clusters that share the materialized container's
-    process orchestrator.
+    Used by the Antithesis compose bootstrap (see test/antithesis/mzcompose.py)
+    to build the CREATE CLUSTER REPLICAS clause for each long-lived pool
+    cluster: one cluster per pool member, with this member as its sole
+    replica. After bootstrap the framework only references the cluster by
+    name (`existing_cluster_name`); pool members aren't passed into
+    `Database` directly.
 
-    The default ports match clusterd's defaults; override per environment.
+    Default ports match clusterd's defaults; override per environment.
     """
 
     host: str
@@ -941,26 +942,18 @@ class ClusterReplica:
     cluster: "Cluster"
     rename: int
     lock: threading.Lock
-    # When non-None, the replica is wired to a pre-existing clusterd
-    # container via unmanaged-cluster syntax (STORAGECTL/COMPUTE ADDRESSES)
-    # rather than provisioned through the orchestrator. The replica's
-    # `size` field is ignored in that case; `pool_member.workers` provides
-    # the WORKERS clause.
-    pool_member: ClusterdPoolMember | None
 
     def __init__(
         self,
         replica_id: int,
         size: str,
         cluster: "Cluster",
-        pool_member: ClusterdPoolMember | None = None,
     ):
         self.replica_id = replica_id
         self.size = size
         self.cluster = cluster
         self.rename = 0
         self.lock = threading.Lock()
-        self.pool_member = pool_member
 
     def name(self) -> str:
         if self.rename:
@@ -992,6 +985,12 @@ class Cluster:
     # parallel-driver, where many concurrent Database instances against one
     # materialize would otherwise collide on the same `cluster-N` names.
     name_scope: str
+    # When set, the cluster represents a pre-existing cluster the framework
+    # did not create and must not drop. `name()` returns this literally
+    # (bypassing cluster_id / rename / name_scope), and `create()` / `drop()`
+    # are no-ops. The replicas list is empty in this mode — the framework
+    # doesn't model the pre-existing replicas because it never touches them.
+    pre_existing_name: str | None
 
     def __init__(
         self,
@@ -1001,26 +1000,20 @@ def __init__(
         replication_factor: int,
         introspection_interval: str,
         name_scope: str = "",
-        pool_members: list[ClusterdPoolMember] | None = None,
+        pre_existing_name: str | None = None,
     ):
         self.cluster_id = cluster_id
         self.managed = managed
         self.size = size
-        # When `pool_members` is supplied, the cluster runs in unmanaged mode
-        # against one pre-existing clusterd container per replica. We force
-        # `managed=False` (the unmanaged-cluster syntax is what carries the
-        # STORAGECTL/COMPUTE ADDRESSES clauses) and ignore `replication_factor`
-        # in favour of `len(pool_members)`.
-        if pool_members is not None:
-            if not pool_members:
-                raise ValueError(
-                    "pool_members must be non-empty when provided; one member per replica"
-                )
+        self.pre_existing_name = pre_existing_name
+        if pre_existing_name is not None:
+            # Pre-existing cluster: framework only models its name. The actual
+            # replicas live in materialize's catalog from the bootstrap step
+            # that created the cluster (see test/antithesis/mzcompose.py).
+            # Setting `pre_existing_name` is what makes `is_pool_backed` True;
+            # the action classes check that to skip DDL on this cluster.
             self.managed = False
-            self.replicas = [
-                ClusterReplica(i, size, self, pool_member=pool_members[i])
-                for i in range(len(pool_members))
-            ]
+            self.replicas = []
         else:
             self.replicas = [
                 ClusterReplica(i, size, self) for i in range(replication_factor)
@@ -1033,13 +1026,19 @@ def __init__(
 
     @property
     def is_pool_backed(self) -> bool:
-        """True iff every replica is wired to a pre-existing clusterd
-        container rather than provisioned through the orchestrator. Action
-        classes that would mutate replica count check this and bail —
-        we don't dynamically allocate from the pool."""
-        return all(r.pool_member is not None for r in self.replicas)
+        """True for clusters the framework didn't create itself and won't
+        mutate (replica count, drop). Currently set when `pre_existing_name`
+        was passed in. Action classes that would CREATE/ALTER/DROP REPLICA
+        check this and bail."""
+        return self.pre_existing_name is not None
 
     def name(self) -> str:
+        # Pre-existing clusters: name is fixed by the caller (typically a
+        # pool-cluster the Antithesis compose bootstrapped). Don't apply
+        # naughtify / name_scope / rename — they don't apply to objects we
+        # didn't create.
+        if self.pre_existing_name is not None:
+            return self.pre_existing_name
         # Format: `cluster[-{name_scope}]-{cluster_id}[-{rename}]`. The
         # bracketed `-{name_scope}` segment is only present when seed-
         # scoping is on, so the historical `cluster-0` / `cluster-0-1`
@@ -1055,29 +1054,13 @@ def __str__(self) -> str:
         return identifier(self.name())
 
     def create(self, exe: Executor) -> None:
+        # Pre-existing cluster: the SUT already has it (bootstrapped at
+        # compose-up). The framework's only responsibility for the cluster
+        # is to use its name; never DDL it.
+        if self.pre_existing_name is not None:
+            return
         query = f"CREATE CLUSTER {self} "
-        if self.is_pool_backed:
-            # Unmanaged cluster pointing at pre-existing clusterd containers.
-            # Each replica gets the STORAGECTL/STORAGE/COMPUTECTL/COMPUTE
-            # ADDRESSES of its pool member; WORKERS comes from the pool
-            # member's config. Requires
-            # `unsafe_enable_unorchestrated_cluster_replicas = true` on the
-            # SUT (see test/antithesis/mzcompose.py for the Antithesis case).
-            replica_specs = []
-            for replica in self.replicas:
-                assert replica.pool_member is not None
-                m = replica.pool_member
-                replica_specs.append(
-                    f"{replica} ("
-                    f"STORAGECTL ADDRESSES ['{m.host}:{m.storagectl_port}'], "
-                    f"STORAGE ADDRESSES ['{m.host}:{m.storage_port}'], "
-                    f"COMPUTECTL ADDRESSES ['{m.host}:{m.computectl_port}'], "
-                    f"COMPUTE ADDRESSES ['{m.host}:{m.compute_port}'], "
-                    f"WORKERS {m.workers}"
-                    f")"
-                )
-            query += "REPLICAS(" + ", ".join(replica_specs) + ")"
-        elif self.managed:
+        if self.managed:
             query += f"SIZE = '{self.size}', REPLICATION FACTOR = {len(self.replicas)}, INTROSPECTION INTERVAL = '{self.introspection_interval}'"
         else:
             query += "REPLICAS("
@@ -1145,13 +1128,13 @@ def __init__(
         # qualified by DB.name() which includes the seed, so they don't
         # need this.
         seed_scoped_names: bool = False,
-        # When non-None, every cluster the Database creates uses the
-        # external clusterd-pool backend (unmanaged-with-explicit-addresses)
-        # rather than the orchestrator. The Database slices this list one
-        # member per replica across its clusters at construction time.
-        # See `ClusterdPoolMember` for the shape; sized to fit the
-        # database's initial cluster + replica plan.
-        pool_members: list[ClusterdPoolMember] | None = None,
+        # When set, the Database runs against a pre-existing cluster the
+        # framework didn't create and won't drop. CreateClusterAction is
+        # disabled in this mode; the single initial cluster wraps the
+        # supplied name. Used by the Antithesis parallel-driver to bind
+        # each invocation to one of the long-lived pool clusters that the
+        # compose creates at bootstrap (see test/antithesis/mzcompose.py).
+        existing_cluster_name: str | None = None,
     ):
         self.host = host
         self.ports = ports
@@ -1159,7 +1142,7 @@ def __init__(
         self.scenario = scenario
         self.seed = seed
         self.seed_scoped_names = seed_scoped_names
-        self.pool_members = pool_members
+        self.existing_cluster_name = existing_cluster_name
         # The bare seed (no leading/trailing punctuation) used by Cluster /
         # Role / etc. to assemble their scoped names. Empty when seed-scoping
         # is off, in which case those classes fall back to their historical
@@ -1205,24 +1188,24 @@ def __init__(
         ]
         self.role_id = len(self.roles)
         # At least one storage cluster required for WebhookSources.
-        # In pool mode, the entire `pool_members` list is consumed by a
-        # single unmanaged cluster — one replica per member — so the
-        # caller controls both replica count and pool-member identity.
-        # This is the only initial cluster; CreateClusterAction is
-        # disabled in pool mode (no in-band allocator).
-        if pool_members is not None:
+        # In existing-cluster mode the framework's sole initial cluster
+        # wraps a pre-existing cluster (typically a pool cluster the
+        # Antithesis compose bootstrapped). The wrapper's create()/drop()
+        # are no-ops; CreateClusterAction / CreateClusterReplicaAction /
+        # DropClusterReplicaAction are also disabled for it.
+        if existing_cluster_name is not None:
             self.clusters = [
                 Cluster(
                     0,
-                    # managed/size are ignored when pool-backed but kept as
-                    # placeholder values for any code that reads them
-                    # without consulting `is_pool_backed`.
+                    # managed / size / replication_factor are ignored when
+                    # `pre_existing_name` is set — the wrapper never emits
+                    # CREATE CLUSTER.
                     managed=False,
-                    size=pool_members[0].host,
-                    replication_factor=len(pool_members),
+                    size="",
+                    replication_factor=1,
                     introspection_interval="1s",
                     name_scope=self.name_scope,
-                    pool_members=pool_members,
+                    pre_existing_name=existing_cluster_name,
                 )
             ]
         else:
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 900b586870e75..e1439653361bb 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -828,6 +828,8 @@ services:
     - KAFKA_BROKER=kafka:9092
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
     - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
+    - ANTITHESIS_CLUSTERD_POOL_SIZE=8
+    - CLUSTERD_POOL_SIZE=8
     - MYSQL_HOST=mysql
     - MYSQL_REPLICA_HOST=mysql-replica
     - MYSQL_PASSWORD=p@ssw0rd
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index d66c63eb3348f..2a75dce53b4e0 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -88,6 +88,12 @@ def __init__(self) -> None:
                 # Name of the unmanaged cluster the workload-entrypoint
                 # provisions against clusterd1 before emitting setup-complete.
                 "MZ_ANTITHESIS_CLUSTER=antithesis_cluster",
+                # Pool size for the long-lived `pool_cluster_{i}` clusters
+                # the entrypoint bootstraps. Mirrored to the parallel-
+                # workload driver (CLUSTERD_POOL_SIZE) so they agree on the
+                # slot count.
+                f"ANTITHESIS_CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}",
+                f"CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}",
                 # MySQL primary and replica connection details.
                 "MYSQL_HOST=mysql",
                 "MYSQL_REPLICA_HOST=mysql-replica",
diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
index f9e182c043ab5..884a030b704db 100644
--- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
+++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
@@ -27,12 +27,19 @@ environmentd's process orchestrator and the fault domain disappears.
 ## Solution
 
 A pool of identical pre-deployed clusterd containers
-(`clusterd-pool-{0..N-1}`). Each invocation claims up to
-`PW_DESIRED_REPLICAS` (default 2) slots via filesystem locking and
-provisions a single unmanaged cluster with one replica per claimed
-slot, then releases the locks on exit. Best-effort: with N slots
-claimed the cluster runs as an N-replica cluster (1 ≤ N ≤ desired);
-no slots → exit cleanly.
+(`clusterd-pool-{0..N-1}`) with a corresponding pool of long-lived
+unmanaged clusters (`pool_cluster_{0..N-1}`), each bound to its slot's
+clusterd. Pool clusters are bootstrapped once by the workload-entrypoint
+and outlive every individual parallel-workload invocation. Each
+parallel-workload invocation picks a slot at random and runs against
+`pool_cluster_{slot}`. There is no coordination between concurrent
+invocations: every workload object lives in a seed-scoped database
+(`db-pw-{seed}-*`) with seed-scoped roles, so two invocations sharing a
+pool cluster don't collide. Antithesis faults containers, not
+invocations, so the per-container fault domain is preserved either way;
+two invocations witnessing the same fault is a feature (more
+independent reproductions per failure). The pool cluster itself is
+never dropped.
 
 Components, bottom-up:
 
@@ -43,121 +50,123 @@ Components, bottom-up:
     mem_env RocksDB (matches production, no scratch volume to fight over).
     Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8).
 
-  - **`parallel_workload.Database(pool_members=...,
+  - **Pool-cluster bootstrap** in
+    `test/antithesis/workload/workload-entrypoint.sh`. After materialized
+    becomes healthy, the script loops over `0..POOL_SIZE-1` and issues
+    `CREATE CLUSTER pool_cluster_{i} REPLICAS (r1 (STORAGECTL ADDRESSES
+    ['clusterd-pool-{i}:2100'], ...))` for each pool member that doesn't
+    already exist. Idempotent across compose-up cycles. Once setup-
+    complete is emitted, every pool cluster is ready for the test
+    composer to start invoking the parallel-workload driver.
+
+  - **`parallel_workload.Database(existing_cluster_name=...,
     seed_scoped_names=True)`**. Opt-in framework mode: when
-    `pool_members` is set, the framework provisions one unmanaged
-    cluster with `len(pool_members)` replicas, each pointed at a pool
-    member via explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES
-    (in place of managed SIZE/REPLICATION FACTOR); the CreateCluster /
-    CreateReplica / DropReplica actions skip pool-backed clusters
-    because there is no in-band allocator. `seed_scoped_names=True`
-    renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` /
-    `role-{seed}-{N}` so concurrent invocations don't collide on
-    global names.
-
-  - **`_claim_pool_slots()`** in
-    `test/antithesis/workload/test/parallel_driver_parallel_workload.py`.
-    Contextmanager that holds up to `PW_DESIRED_REPLICAS` exclusive
-    `fcntl.flock`s on `/tmp/clusterd-pool-slots/{i}.lock` for the
-    lifetime of the invocation. Slots are tried in randomized order so
-    allocation is decorrelated from invocation seed. Every claimed lock
-    is released on context exit (normal or exception), so a crashing
-    driver doesn't strand any slot.
+    `existing_cluster_name` is set, the framework's single initial
+    cluster is a wrapper around the pre-existing cluster — `create()`
+    and `drop()` are no-ops, `is_pool_backed` is True (which gates the
+    CreateCluster / CreateReplica / DropReplica actions). `Cluster.name()`
+    returns the literal cluster name supplied by the caller, bypassing
+    the framework's normal `cluster-{seed}-{id}` shape. Roles still get
+    seed-scoped naming (`role-{seed}-{N}`) so concurrent invocations
+    don't collide on those.
+
+  - **Slot pick** in
+    `test/antithesis/workload/test/parallel_driver_parallel_workload.py`:
+    `rng.randrange(CLUSTERD_POOL_SIZE)`. Stateless, no coordination,
+    no failure mode. Concurrent invocations may share a pool cluster
+    (see the no-collision argument above).
 
   - **`_drop_seed_scoped_objects()`** in the same driver, called in
-    `main()`'s `finally`. Drops every cluster / database / role whose
-    name starts with `cluster-{seed}-` / `db-pw-{seed}-` /
-    `role-{seed}-`. The DROP CLUSTER re-arms the clusterd to be
-    claimed by the next invocation through the reconcile path
-    (see below).
-
-## Clusterd reuse correctness
-
-The pool design assumes a DROP CLUSTER followed by a CREATE CLUSTER
-pointed at the same clusterd is a supported transition. It is — this is
-the same reconciliation path that handles environmentd restart. The
-three pieces:
-
-  1. **Transport cancels the prior connection on every new connect.**
-     `src/service/src/transport.rs::serve` drops the old
-     connection-task token and awaits the task before installing a
-     fresh handler from `handler_fn()`. The new `ClusterClient` is a
-     blank-slate wrapper around the same `Arc<Mutex<TimelyContainer>>`.
-
-  2. **The worker `run` loop survives client disconnects.**
-     `src/storage/src/storage_state.rs::Worker::run` is
-     `while let Some((nonce, rx, tx)) = client_rx.blocking_recv() {
-     run_client(rx, tx); }`. When the old `cmd_tx` is dropped (because
-     the cancel above tore down the prior client), `run_client` returns
-     and the outer loop awaits the next `(nonce, rx, tx)` — the new
-     controller's connection. Worker in-memory state stays resident
-     between connections.
-
-  3. **`reconcile()` drops stale state.** The new controller's first
-     batch of commands ending in `InitializationComplete` is processed
-     by `storage_state::reconcile`: it computes `expected_objects` from
-     the new commands, identifies `stale_objects` as anything the
-     worker knows about that the new controller did not ask for, and
-     `drop_collection`s each one — releasing source tokens (which tears
-     down Kafka consumers, persist write handles, upsert RocksDB state),
-     dropping dataflows, clearing reported frontiers.
-
-Collection IDs do not collide across cluster lifetimes because
-Materialize allocates them globally (`u<n>`, `t<n>`), not per cluster.
-
-The one piece intentionally shared across reconnects is the
-`Arc<PersistClientCache>`. It is keyed by URL+credentials, not by
-cluster identity, and reusing it is the standard production behavior
-(avoids reauthenticating to S3 / postgres-metadata on every reconnect).
-
-The same analysis holds for the compute side (`src/compute/src/server.rs`
-uses the same `ClusterSpec` pattern).
+    `main()`'s `finally`. Drops every database and role whose name
+    starts with `db-pw-{seed}-` / `role-{seed}-`. **Pool clusters are
+    NOT dropped** — they're permanent state shared across invocations.
+    The DROP DATABASE CASCADE transitively drops every workload-created
+    table / MV / index / source / sink, which tears down the
+    corresponding dataflows on the bound clusterd container, so the
+    cluster returns to an idle baseline before the next invocation.
+
+## Why pool clusters must be permanent: the clusterd-reuse constraint
+
+The first iteration of this design dropped and recreated the parallel-
+workload cluster on every invocation. That failed on the second
+invocation against the same pool slot with a clusterd halt:
+
+> `WARN ...: halting process: new instance configuration not compatible
+> with existing instance configuration: ... index_logs:
+> {Timely(Operates): IntrospectionSourceIndex(144115188075856897), ...}
+> vs Some(... IntrospectionSourceIndex(144115188075856641), ...)`
+
+The check is `InstanceConfig::compatible_with` in
+`src/compute-client/src/protocol/command.rs`. It compares `LoggingConfig`
+including `index_logs: BTreeMap<LogVariant, IntrospectionSourceIndex>`.
+Those introspection-source-index IDs are per-cluster catalog allocations
+— every CREATE CLUSTER produces a fresh batch. Pointing a *different*
+cluster identity at a clusterd that already saw a prior cluster's
+introspection indexes trips this check and the clusterd halts on the
+first `CreateInstance` command.
+
+Reconcile (`storage_state::reconcile`, `compute::server`) handles the
+case where the *same* cluster reconnects after an environmentd restart:
+the worker drops stale collections, takes the new commands, and resumes.
+But it does not handle the case where a different cluster claims the
+clusterd, because the introspection indexes don't match.
+
+Pinning cluster identity to clusterd identity — one permanent pool
+cluster per pool clusterd container — sidesteps the check entirely. The
+only reconnect events the pool clusterds see across the lifetime of a
+compose are environmentd restarts (and Antithesis-injected pauses /
+restarts of the pool clusterd itself), both of which exercise the same
+cluster identity reconnecting. That's the path reconcile is designed for.
 
 ## Failure modes
 
-  - **All pool slots held.** Driver tags `sometimes(...)` for
-    visibility and exits cleanly. With the default pool size (8) and
-    the test composer's normal concurrency this is not expected to
-    fire, but if it does we'll see it in the run report.
-
-  - **Crash before drop-on-exit runs.** The flock is released
-    automatically when the process dies (kernel-level lock release).
-    The clusterd is left holding stale state until the next claimant
-    reconciles. Catalog leftovers (`cluster-{seed}-*`,
-    `role-{seed}-*`, `db-pw-{seed}-*`) accumulate until the next
-    invocation with the same seed runs its setup sweep — extremely
-    unlikely since seeds are u64-random. The setup sweep is scoped
-    to the current seed only, so it does not clean cross-invocation
-    leftovers. A periodic external cleanup or a startup-time scan
-    against `mz_clusters` / `mz_roles` / `mz_databases` would be
-    needed to close this loop properly. For now the catalog growth
-    is bounded by run length and not currently a problem.
-
-  - **Pool sizing wrong vs concurrency.** If concurrency exceeds pool
-    size, the late arrivals get "no slot" and exit. We do not currently
-    auto-tune; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if telemetry shows
-    the "no slot available" signal firing.
+  - **Crash before drop-on-exit runs.** The seed-scoped database and
+    roles are left in the catalog until they're explicitly cleaned up.
+    Catalog leftovers do not break correctness (each seed is u64-random,
+    no cross-invocation collisions) but they accumulate. The next
+    invocation that lands on the same pool cluster will inherit MVs /
+    indexes / sources still rendered on the bound clusterd from the
+    crashed invocation, which is more state pressure than a clean
+    handoff. A periodic / startup-time sweep against `mz_databases` /
+    `mz_roles` would close this; deferred until it shows up as a
+    problem.
+
+  - **Pool size much smaller than concurrency.** With C concurrent
+    invocations and N pool slots, ~C/N invocations share each cluster
+    in steady state. That's correctness-preserving but increases
+    per-cluster state pressure linearly with the ratio. Bump
+    `ANTITHESIS_CLUSTERD_POOL_SIZE` if a single pool cluster runs hot.
 
 ## v1 limitations (future work)
 
+  - **Single-replica pool clusters.** Each pool cluster has one replica
+    (one clusterd container per cluster), so parallel-workload
+    invocations don't exercise multi-replica compute/storage paths.
+    Multi-replica coverage stays in `antithesis_cluster`. A future
+    revision could pair clusterd containers into 2-replica pool
+    clusters at the cost of doubling the pool footprint per
+    concurrency unit.
+
   - **No in-band allocator inside the framework.** Worker threads
-    cannot grab additional pool members mid-run, so
+    cannot grab additional pool clusters mid-run, so
     `CreateClusterAction` / `CreateClusterReplicaAction` /
     `DropClusterReplicaAction` are skipped when pool-backed. The
-    framework only ever touches the pre-allocated pool members.
+    framework only ever touches the pre-existing pool cluster.
 
-  - **No global GC of cross-invocation catalog leftovers.** See
-    failure modes above. A first-invocation sweep against
-    `mz_clusters WHERE name LIKE 'cluster-%-%'` minus the current
-    seed would close this; deferred until it becomes a problem.
+  - **State accumulation on pool clusters.** Each pool cluster runs
+    through O(invocations) workload lifecycles over a long Antithesis
+    run. Even with seed-scoped DBs being dropped on exit, every pool
+    cluster's clusterd retains compute-side bookkeeping (catalog
+    state for introspection, peek_stash subscriptions, etc.). The
+    framework relies on `drop_collection` to release dataflow state;
+    if that path ever leaks, the pool cluster's memory footprint will
+    grow over many invocations.
 
 ## Tunables
 
 | Variable | Default | Effect |
 |---|---|---|
-| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. |
-| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. |
-| `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. |
-| `PW_DESIRED_REPLICAS` (driver) | 2 | Replicas to ask for per invocation's cluster. Best-effort: driver claims up to this many slots and runs with whatever it gets (≥1). |
+| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 8 | Number of `clusterd-pool-{i}` containers deployed and matching `pool_cluster_{i}` clusters bootstrapped. |
+| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver chooses among. Mirrored from compose by mzcompose.py's Workload service so the two agree. |
 | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. |
 | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. |
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index edbce1b622636..427a0babc0f16 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -38,8 +38,6 @@
 
 from __future__ import annotations
 
-import contextlib
-import fcntl
 import logging
 import os
 import random
@@ -80,7 +78,6 @@
     MAX_TABLES,
     MAX_VIEWS,
     MAX_WEBHOOK_SOURCES,
-    ClusterdPoolMember,
     Database,
 )
 from materialize.parallel_workload.executor import Executor
@@ -108,38 +105,24 @@
 RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20"))
 NUM_THREADS = int(os.environ.get("PW_THREADS", "4"))
 
-# Number of clusterd-pool-{i} containers reserved for the parallel-workload
-# driver. Must match the pool actually deployed in
-# test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there →
-# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims
-# slots via `fcntl.flock` (see `_claim_pool_slots`); the locks are held
-# for the lifetime of the invocation so concurrent driver processes
-# inside the workload container can't pick the same clusterd.
+# Number of long-lived pool_cluster_<i> clusters the workload-entrypoint
+# bootstrapped, one per clusterd-pool-<i> container. Must match
+# `ANTITHESIS_CLUSTERD_POOL_SIZE` in test/antithesis/mzcompose.py (the
+# Workload service mirrors that value into the workload container's env
+# as both ANTITHESIS_CLUSTERD_POOL_SIZE and CLUSTERD_POOL_SIZE so the
+# bootstrap script and driver agree).
+#
+# Each invocation picks a pool slot at random and runs against the
+# corresponding pool_cluster_<slot>. No coordination between concurrent
+# invocations: two invocations may share a pool cluster — every workload
+# object is in a seed-scoped database (`db-pw-<seed>-*`) with seed-scoped
+# roles, so DDL/DML never collides; the only shared state is the
+# permanent pool cluster, which is purposefully shared. Antithesis still
+# faults one container at a time, so the per-container fault domain is
+# preserved; multiple invocations witnessing the same fault is a
+# feature (more independent reproductions per failure).
 CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8"))
 
-# Workers configured per clusterd-pool-{i} process. Must match the
-# `Clusterd(..., workers=...)` argument in test/antithesis/mzcompose.py
-# or the unmanaged CREATE CLUSTER REPLICA's `WORKERS` count will diverge
-# from what clusterd actually runs.
-CLUSTERD_POOL_WORKERS = 4
-
-# Replicas to ask for per invocation's cluster. Best-effort: the driver
-# claims up to this many pool slots and runs whatever it gets (≥1). With
-# DESIRED_REPLICAS=2 and POOL_SIZE=8 we get multi-replica coverage for
-# the parallel-workload cluster (currently only `antithesis_cluster` is
-# multi-replica) when capacity allows, while degrading gracefully to a
-# single-replica cluster under contention.
-DESIRED_REPLICAS = int(os.environ.get("PW_DESIRED_REPLICAS", "2"))
-
-# Filesystem locks let concurrent parallel-workload invocations claim
-# distinct clusterd-pool members without coordinating through the SUT.
-# All invocations exec inside the single `workload` container so a
-# regular flock on a tmpfs path is sufficient (no cross-container
-# coordination required).
-POOL_SLOT_LOCK_DIR = os.environ.get(
-    "CLUSTERD_POOL_SLOT_LOCK_DIR", "/tmp/clusterd-pool-slots"
-)
-
 
 def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None:
     try:
@@ -328,66 +311,21 @@ def _tolerate_setup_race(fn, *args, **kwargs):
         raise
 
 
-@contextlib.contextmanager
-def _claim_pool_slots(rng: random.Random, desired: int):
-    """Hold exclusive `fcntl.flock`s on up to `desired` pool-slot lockfiles
-    for the duration of the `with` block. Yields the list of claimed slot
-    indices (length 0–`desired`); the caller decides what to do with each
-    population (1 = single-replica fallback, ≥2 = multi-replica cluster,
-    0 = no slots available, exit cleanly).
-
-    Slots are tried in randomized order so allocation is decorrelated
-    from invocation seed / wall clock. Every claimed flock is released
-    when the context exits — normally or via exception — so a crashing
-    driver doesn't strand any slot.
-
-    All parallel-workload driver invocations share the workload
-    container's filesystem, so plain flock on a tmpfs path under
-    `POOL_SLOT_LOCK_DIR` is sufficient serialization (no cross-container
-    coordination required).
-    """
-    try:
-        os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True)
-    except OSError as exc:
-        LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc)
-        yield []
-        return
-
-    slots = list(range(CLUSTERD_POOL_SIZE))
-    rng.shuffle(slots)
-    held: list[tuple[int, int]] = []  # (slot, fd)
-    try:
-        for slot in slots:
-            if len(held) >= desired:
-                break
-            path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock")
-            fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600)
-            try:
-                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
-            except OSError:
-                os.close(fd)
-                continue
-            held.append((slot, fd))
-        yield [slot for slot, _ in held]
-    finally:
-        for _, fd in held:
-            try:
-                fcntl.flock(fd, fcntl.LOCK_UN)
-            finally:
-                os.close(fd)
-
-
 def _drop_seed_scoped_objects(seed: str) -> None:
-    """Drop everything this invocation's seed owns: its clusters, roles, and
-    databases. Called from `main()`'s finally so each invocation leaves the
-    catalog clean and frees its pool-slot's clusterd to be claimed by the
-    next driver run (DROP CLUSTER tears down the unmanaged replica → the
-    clusterd's existing controller connection ends → the next CREATE
-    CLUSTER pointed at the same address claims it via fresh reconcile).
+    """Drop everything this invocation's seed owns: its databases and roles.
+
+    Pool clusters are NOT dropped — they're long-lived, bootstrapped by the
+    workload-entrypoint, and shared (one cluster per slot) across many
+    invocations. The DROP DATABASE CASCADE here transitively drops every
+    table / MV / index / source / sink the workload created on the pool
+    cluster during its run, which tears down the corresponding dataflows
+    on the bound clusterd container — so the cluster goes back to an idle
+    baseline before the next invocation claims the same slot.
 
     Errors here are logged and swallowed: leftover objects only cost a bit
-    of catalog footprint until the next invocation's setup sweep picks them
-    up. Don't let a cleanup failure turn into an assertion failure.
+    of catalog footprint until the next invocation with the same seed
+    re-runs (extremely unlikely since seeds are u64-random). Don't let a
+    cleanup failure turn into an assertion failure.
     """
     from pg8000.native import identifier
 
@@ -413,12 +351,6 @@ def _drop(sql: str) -> None:
                 except Exception as exc:  # noqa: BLE001
                     LOG.debug("cleanup tolerated: %s — %s", sql, exc)
 
-            cur.execute(
-                f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'".encode()
-            )
-            for row in cur.fetchall():
-                _drop(f"DROP CLUSTER {identifier(row[0])} CASCADE")
-
             cur.execute(
                 f"SELECT name FROM mz_databases WHERE name LIKE 'db-pw-{seed}-%'".encode()
             )
@@ -440,11 +372,10 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
     sql-server, and an external postgres source — none of those are running
     in this compose.
 
-    Catalog sweeps are scoped to objects this invocation owns: clusters
-    matching `cluster-{seed}-%` and roles matching `role-{seed}-%`. The
-    seed-scoped names are produced by `Database(seed_scoped_names=True)`;
-    cleaning anything broader would delete state belonging to other
-    concurrent invocations sharing the same SUT.
+    Catalog sweeps are scoped to objects this invocation owns: roles
+    matching `role-{seed}-%`. Pool clusters are NOT touched — they're
+    long-lived state shared across many invocations (one cluster per
+    pool slot, bootstrapped by the workload-entrypoint).
 
     The shared connections / secret (`kafka_conn`, `csr_conn`, `aws_conn`,
     `minio`) live outside any seed-scoped database and are required by every
@@ -468,12 +399,6 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
     # `seed` is the random_u64 the driver minted at the top of main(), so
     # it's already safe to splice into SQL literally. `Executor.execute`
     # takes a query string and doesn't support parameter binding.
-    exe.execute(f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'")
-    for row in exe.cur.fetchall():
-        _tolerate_setup_race(
-            exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE"
-        )
-
     exe.execute(f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'")
     for row in exe.cur.fetchall():
         _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}")
@@ -561,69 +486,53 @@ def main() -> int:
 
     _prepare_system(NUM_THREADS)
 
-    # Claim up to DESIRED_REPLICAS pool slots; the cluster runs with as
-    # many replicas as we got (≥1). Locks are held until main() returns;
-    # if no slot is free we tag a sometimes() and exit cleanly.
+    # Pick a pool slot at random. Each slot maps to a long-lived
+    # pool_cluster_<slot> bootstrapped by the workload-entrypoint, with
+    # one replica on the matching clusterd-pool-<slot> container.
     #
-    # Each replica lands on its own clusterd-pool-{slot} container, so
-    # Antithesis can fault one replica's container without taking the
-    # cluster offline — exercises the same multi-replica recovery paths
-    # `antithesis_cluster` covers, but on the workload-driven cluster.
-    with _claim_pool_slots(rng, DESIRED_REPLICAS) as pool_slots:
-        sometimes(
-            len(pool_slots) > 0,
-            "parallel workload: clusterd pool slots claimed",
-            {"pool_size": CLUSTERD_POOL_SIZE, "claimed": len(pool_slots)},
-        )
-        sometimes(
-            len(pool_slots) >= DESIRED_REPLICAS,
-            "parallel workload: full multi-replica pool claim",
-            {"pool_size": CLUSTERD_POOL_SIZE, "desired": DESIRED_REPLICAS},
-        )
-        if not pool_slots:
-            LOG.info(
-                "parallel-workload exiting cleanly: no pool slots available "
-                "(pool_size=%d)",
-                CLUSTERD_POOL_SIZE,
-            )
-            return 0
-        pool_members = [
-            ClusterdPoolMember(
-                host=f"clusterd-pool-{slot}",
-                workers=CLUSTERD_POOL_WORKERS,
-            )
-            for slot in pool_slots
-        ]
-        LOG.info(
-            "parallel-workload claimed %d pool slot(s): %s",
-            len(pool_slots),
-            ", ".join(m.host for m in pool_members),
-        )
-        return _run_invocation(seed, rng, pool_members)
+    # No coordination with other concurrent driver invocations: all
+    # workload state is in a seed-scoped database, so two invocations
+    # sharing a pool cluster don't collide. Antithesis still faults
+    # containers, not invocations, so the per-container fault domain
+    # is preserved; multiple invocations witnessing the same fault give
+    # us more independent reproductions per failure.
+    #
+    # Keeping the cluster identity per slot is what makes clusterd reuse
+    # safe across invocations (reconnects against the same cluster pass
+    # `InstanceConfig::compatible_with`; reconnects against a *different*
+    # cluster trip clusterd's introspection-index mismatch halt).
+    pool_slot = rng.randrange(CLUSTERD_POOL_SIZE)
+    cluster_name = f"pool_cluster_{pool_slot}"
+    LOG.info(
+        "parallel-workload using pool slot %d → cluster %s",
+        pool_slot,
+        cluster_name,
+    )
+    return _run_invocation(seed, rng, cluster_name)
 
 
 def _run_invocation(
     seed: str,
     rng: random.Random,
-    pool_members: list[ClusterdPoolMember],
+    cluster_name: str,
 ) -> int:
-    """The bulk of `main()` once pool slot(s) have been claimed. Split out
-    so the slot locks stay held across this whole call: they are released
-    when the enclosing `with` block in `main()` exits.
+    """The bulk of `main()` once a pool slot has been claimed. Split out
+    of `main()`; no slot lock is held, so the pool cluster named by
+    `cluster_name` may be shared with other concurrent invocations.
     """
 
     # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection
     # drops, which mirrors what Antithesis container-pauses look like at the
     # client. We never instantiate `KillAction` itself.
     #
-    # `seed_scoped_names=True` keeps cluster/role names from colliding when
-    # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for
-    # the fallback when they collide anyway.
+    # `seed_scoped_names=True` keeps role names from colliding when
+    # concurrent invocations share the SUT.
     #
-    # `pool_members=pool_members` makes a single unmanaged cluster with one
-    # replica per member; the framework forces managed=False and emits
-    # unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE ADDRESSES
-    # for each replica.
+    # `existing_cluster_name=cluster_name` makes the Database wrap the
+    # pool cluster bootstrapped at compose-up; the framework's
+    # CreateClusterAction / CreateClusterReplicaAction /
+    # DropClusterReplicaAction are disabled for it and Cluster.create()
+    # / Cluster.drop() are no-ops.
     database = Database(
         rng=rng,
         seed=seed,
@@ -639,7 +548,7 @@ def _run_invocation(
         scenario=Scenario.Kill,
         naughty_identifiers=False,
         seed_scoped_names=True,
-        pool_members=pool_members,
+        existing_cluster_name=cluster_name,
     )
 
     end_time = time.time() + RUNTIME_S
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index 1a8aab5234f51..bfef3f12e4c1a 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -17,6 +17,10 @@ PGUSER="${PGUSER:-materialize}"
 PGPORT_INTERNAL="${PGPORT_INTERNAL:-6877}"
 PGUSER_INTERNAL="${PGUSER_INTERNAL:-mz_system}"
 CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}"
+# Number of long-lived pool clusters to bootstrap, each bound to its own
+# clusterd-pool-{i} container. Must match `ANTITHESIS_CLUSTERD_POOL_SIZE`
+# in mzcompose.py and `CLUSTERD_POOL_SIZE` in the parallel-workload driver.
+CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-8}"
 
 # Wait for materialized to be ready.
 echo "Waiting for materialized to become healthy..."
@@ -64,6 +68,42 @@ else
     echo "Cluster '$CLUSTER' already exists; skipping provisioning."
 fi
 
+# Bootstrap a long-lived `pool_cluster_{i}` for each clusterd-pool-{i}
+# container. Each pool cluster has exactly one replica wired to its
+# matching pool clusterd. Each parallel-workload driver invocation picks
+# a slot at random (no locking; invocations may share a slot) and runs
+# against `pool_cluster_{slot}` for its entire lifetime. The cluster
+# identity is tied to the clusterd identity, so reconnects don't trip
+# clusterd's `instance configuration not compatible` halt; only the
+# seed-scoped database / roles get dropped between invocations.
+#
+# Idempotent: skip pool clusters that already exist (the SUT's catalog
+# survives across `docker compose up` if metadata volumes aren't wiped).
+for i in $(seq 0 $((CLUSTERD_POOL_SIZE - 1))); do
+    POOL_CLUSTER="pool_cluster_$i"
+    existing_pool=$(
+        psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \
+            "SELECT 1 FROM mz_clusters WHERE name = '$POOL_CLUSTER'"
+    )
+    if [[ -n "$existing_pool" ]]; then
+        echo "Pool cluster '$POOL_CLUSTER' already exists; skipping provisioning."
+        continue
+    fi
+    echo "Provisioning pool cluster '$POOL_CLUSTER' on clusterd-pool-$i..."
+    psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" <<SQL
+CREATE CLUSTER ${POOL_CLUSTER} REPLICAS (
+    r1 (
+        STORAGECTL ADDRESSES ['clusterd-pool-${i}:2100'],
+        STORAGE ADDRESSES ['clusterd-pool-${i}:2103'],
+        COMPUTECTL ADDRESSES ['clusterd-pool-${i}:2101'],
+        COMPUTE ADDRESSES ['clusterd-pool-${i}:2102'],
+        WORKERS 4
+    )
+);
+GRANT ALL ON CLUSTER ${POOL_CLUSTER} TO ${PGUSER};
+SQL
+done
+
 # Emit setup_complete — Antithesis begins test commands after this.
 /usr/local/bin/setup-complete.sh
 

From 820f76adad14398da0621df5e504ed9c4d5e7e9c Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 10:57:43 -0400
Subject: [PATCH 49/65] test/antithesis: add upsert-ancient-key-writable
 cross-invocation property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sibling property upsert-key-reflects-latest-value tests only freshly-
written keys within one invocation. This new property tests the
complementary case: a key that has been resident in the upsert source
for a long time (many invocations, many faults, possibly many clusterd
restarts) must still accept a fresh write and have that write reflected
in the source.

The bug class this catches is a long-resident-state rehydration
regression where the upsert operator's state-store remembers a key's
value with enough fidelity to serve reads but enough wrongness that
fresh writes are silently dropped — the user's pipeline appears stuck
with no error.

Implementation: parallel_driver_upsert_ancient_key_writable.py owns a
dedicated key ring (ancient-k<0..31>) so it never collides with the
sibling driver's per-invocation keys. Each invocation picks 5 ring
slots at random, snapshots their current values, produces fresh
'cross-<prefix>-<nonce>' values, waits for catchup, and asserts that
each key's reflected value changed (or, for first-touch ring slots,
that a row now exists).

The 'always' assertion is race-tolerant against concurrent invocations
of this driver writing to the same ring slot — the only forbidden
outcome is 'row still has the exact old value we tried to overwrite,
with no peer interference,' which means our write was silently lost.
A separate 'sometimes' clause records when our specific new value
reached the source as the win-the-race liveness signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../properties/upsert-ancient-key-writable.md |  72 +++++
 ...llel_driver_upsert_ancient_key_writable.py | 273 ++++++++++++++++++
 2 files changed, 345 insertions(+)
 create mode 100644 test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md
 create mode 100644 test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py

diff --git a/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md b/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md
new file mode 100644
index 0000000000000..c23448a04163b
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md
@@ -0,0 +1,72 @@
+# upsert-ancient-key-writable
+
+## Summary
+
+After a key has been resident in an UPSERT-envelope source for a long time (many invocations, many fault windows, many materialized restarts), writing a fresh `(key, value)` to that key must still update the source's view of it.
+
+The bug class this catches: an upsert state-store rehydration regression that "remembers" a key's old value with enough fidelity to serve reads but enough wrongness that fresh writes are silently dropped, leaving the source stuck at a value it shouldn't be at.
+
+## Why this isn't just `upsert-key-reflects-latest-value`
+
+`upsert-key-reflects-latest-value` is verified per invocation: each invocation writes its own short-lived keys, settles, asserts. The keys it touches were freshly created at the start of that invocation. The rehydration / long-resident-state code paths are exercised only incidentally, when an Antithesis fault lands between a produce and a check inside one invocation.
+
+`upsert-state-rehydrates-correctly` covers explicit clusterd-restart rehydration but only over a single invocation's worth of state, and only for read-after-rehydrate, not write-after-rehydrate.
+
+This property covers the gap: long-resident keys plus fresh writes against them.
+
+## Code paths
+
+- `src/storage/src/upsert.rs` — `upsert_classic` operator. The `multi_get` → check `from_time > prior_order` → `multi_put` sequence must work the same whether `multi_get` returns a value the worker freshly observed in this incarnation or one rehydrated from persist state at startup.
+- `src/storage/src/upsert/types.rs` — `StateValue::ensure_decoded` finalizes consolidating state into either a `Value` or a tombstone. If `ensure_decoded` ever yields a stale value mismatching the persist state, this property will surface it after the next write.
+- `src/storage/src/upsert_continual_feedback.rs` — same contract, persist-feedback flavor.
+
+## How to check it
+
+Workload procedure (per invocation):
+1. Pick K=5 keys at random from a fixed ring of N=32 keys owned by this driver: `ancient-k{0..31}`.
+2. For each picked key, `SELECT text FROM source WHERE key = ?` at real-time recency. Record `old_value` (which may be `NULL` if no prior invocation wrote that ring slot yet).
+3. Produce a fresh value `cross-<my_prefix>-<nonce>` to each picked key on Kafka.
+4. Request a quiet period and wait for `offset_committed` to reach the produced max offset.
+5. Re-query each key. Assert:
+   - If `old_value` was present: post-catchup the source's view must NOT equal `old_value`. Race-tolerant against concurrent peers writing or tombstoning the same key — those outcomes also change the value, and only "row still has the exact old value while no one else touched it" indicates our write was silently dropped.
+   - If `old_value` was absent (first-time write to that ring slot): post-catchup a row must exist for the key.
+
+## What goes wrong on violation
+
+A write to a long-resident upsert key is silently dropped: Kafka acked the produce, materialize ingested the message (`offset_committed` advanced), but the upsert state did not update. Read-only paths still return the old value; the user sees their pipeline "stuck" with no error.
+
+## Antithesis angle
+
+The interesting time window is the time between the ring slot's most recent write and the next one. In a long Antithesis run, that window spans many fault injections, clusterd restarts, and materialize-driven rehydrations of the upsert source's persist state. The longer the run, the more genuinely "ancient" the prior value is when we revisit.
+
+Combine with:
+- Node-termination faults — exercises the rehydration path between writes to the same ring slot.
+- Network-partition faults between materialized and clusterd-pool members — exercises feedback-channel recovery.
+- Long-lived runs (multi-hour) — gives time for many ring-slot revisits with intervening faults.
+
+## Dependencies
+
+- The fixed ring `ancient-k{0..31}` is namespaced away from any sibling driver's keys, so this driver doesn't interfere with `upsert-key-reflects-latest-value`'s assertions.
+- Two concurrent invocations of THIS driver picking the same ring slot is the race the `always` assertion is designed to tolerate. The `sometimes` clause "our specific new value reached the source" still fires when one invocation wins.
+
+## Existing instrumentation
+
+None. Candidate SUT anchors: `assert_sometimes!(upsert_long_resident_key_written, ...)` at the `multi_put` site, conditioned on the key's `from_time` being at least N minutes behind wall clock, would confirm the property's specific path is exercised. Deferred.
+
+## Implementation status
+
+Implemented as `test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py`.
+
+| Message | Type | Fires when |
+|---------|------|------------|
+| `"upsert: write to ancient key changes its reflected value"` | `always` | Per ancient ring slot that had a prior value, post-catchup. False ⟺ row still present with the exact pre-write value. |
+| `"upsert: write to previously-empty ancient key creates a row"` | `always` | Per ancient ring slot that was empty before our write, post-catchup. False ⟺ no row exists despite our non-null produce + catchup. |
+| `"upsert: at least one ancient-ring key has a prior value to overwrite"` | `sometimes` | Per invocation. Confirms the property's interesting path (overwrite, not first-touch) is exercised. |
+| `"upsert: source caught up after cross-invocation produces"` | `sometimes` | Per invocation. Liveness for the catchup gate. |
+| `"upsert: cross-invocation driver's own write reached the source"` | `sometimes` | Per invocation. Confirms the full write→catchup→read pipeline works end-to-end at least sometimes (most of the time, under low concurrency). |
+
+Knobs: `ANCIENT_KEY_RING_SIZE=32`, `ANCIENT_KEYS_PER_INVOCATION=5`, `QUIET_PERIOD_S=20`, `CATCHUP_TIMEOUT_S=60.0`.
+
+## Provenance
+
+Surfaced by: Data Integrity (long-lived upsert state correctness).
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
new file mode 100644
index 0000000000000..296bf115fd425
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `upsert-ancient-key-writable`.
+
+The sibling driver `parallel_driver_upsert_latest_value.py` writes keys
+under a fresh per-invocation prefix (`p<u64hex>-k{0..7}`) and only ever
+revisits its own keys *within* an invocation. Each key it writes is
+abandoned at end-of-invocation — the row persists in the upsert source,
+but no future invocation of that driver ever touches it again.
+
+This driver exercises the question: *if I write into a key that has
+been resident in the upsert source for a long time, does the source
+still reflect the write?* A failure here looks like an upsert state-
+rehydration bug surfacing only after enough time / fault-injection has
+elapsed: the source remembers the key from when it was originally
+written, but a fresh write produces no observable change.
+
+To exercise that property without interfering with the sibling driver
+(whose `always("upsert: SELECT for key matches latest produced value",
+…)` assumes nothing concurrent is writing to its prefixed keys), this
+driver owns its own key ring `ancient-k{0..N-1}`. Each invocation picks
+K of them at random, snapshots their current values, produces fresh
+values, waits for catchup, and asserts that the source's view of each
+key changed. Between two invocations that happen to target the same
+ring slot, a lot of wall time and fault injection may elapse — the
+longer the run, the more genuinely "ancient" the snapshotted value is
+at the time of the next overwrite.
+
+Assertion shape, per targeted key:
+  - `always`: post-catchup, the source's view of the key is NOT the
+    old value we snapshotted. The observed value can be (a) our new
+    value, (b) some other invocation's cross-overwrite, or (c) absent
+    (only possible if some concurrent peer tombstoned the key — this
+    driver never tombstones). The one outcome we must never see is
+    "row still present with the exact old value we just overwrote," which
+    means the write was silently dropped while no peer interfered.
+  - `sometimes`: the observed value equals OUR specific new value.
+    Liveness — confirms we sometimes win the race against any concurrent
+    peers and fully exercise the write+read pipeline through to query.
+
+Initial state for each key in the ring: until the first invocation
+writes to ring slot `k`, no row exists for it. We tolerate that as
+old_value=None and only assert post-catchup that the row now exists.
+The first invocation to write each key seeds the property for later
+ones.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+import helper_random
+from antithesis.assertions import always, sometimes
+from helper_kafka import make_producer
+from helper_pg import query_retry
+from helper_quiet import request_quiet_period
+from helper_source_stats import wait_for_catchup
+from helper_upsert_source import (
+    SOURCE_UPSERT_TEXT,
+    TOPIC_UPSERT_TEXT,
+    ensure_upsert_text_source,
+)
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.upsert_ancient_key_writable")
+
+# Fixed key ring owned exclusively by this driver. No other driver writes
+# keys matching this prefix, so the property's assertions are race-free
+# against the rest of the workload. The size sets the upper bound on how
+# many distinct "ancient" rows accumulate — small enough that revisits
+# happen often, large enough that two concurrent invocations of this
+# driver usually don't pick the same ring slot.
+ANCIENT_KEY_PREFIX = "ancient-k"
+ANCIENT_KEY_RING_SIZE = 32
+
+# Number of ancient keys to target per invocation. Small on purpose — the
+# Test Composer launches this driver many times, so coverage comes from
+# many short invocations rather than one big one.
+ANCIENT_KEYS_PER_INVOCATION = 5
+
+QUIET_PERIOD_S = 20
+CATCHUP_TIMEOUT_S = 60.0
+
+
+def _produce(producer, tracker, topic: str, key: str, value: str) -> None:
+    producer.produce(
+        topic=topic,
+        key=key.encode("utf-8"),
+        value=value.encode("utf-8"),
+        on_delivery=tracker.callback,
+    )
+
+
+def _snapshot_current_value(key: str) -> tuple[bool, str | None]:
+    """Return (found, value) for `key` at a real-time-recency timestamp.
+    UPSERT contract: at most one row per key.
+    """
+    rows = query_retry(
+        f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s",
+        (key,),
+        real_time_recency=True,
+    )
+    if not rows:
+        return False, None
+    count, value = rows[0]
+    if count == 0:
+        return False, None
+    if count != 1:
+        raise RuntimeError(
+            f"upsert source has {count} rows for key {key!r}; this driver assumes "
+            "the per-key uniqueness property holds"
+        )
+    return True, value
+
+
+def main() -> int:
+    ensure_upsert_text_source()
+
+    # Per-invocation prefix is only used as a nonce-namespace for our
+    # written values, so triage can attribute a `cross-<prefix>-<nonce>`
+    # value back to a specific invocation. The KEYS we target come from
+    # the shared ring, not from any per-invocation prefix.
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; prefix=%s", prefix)
+
+    # Pick K distinct ring slots at random. The helper module doesn't
+    # expose `random_sample`, so we sample without replacement via
+    # repeated random_choice with removal.
+    candidate_pool = list(range(ANCIENT_KEY_RING_SIZE))
+    slot_indices: list[int] = []
+    for _ in range(min(ANCIENT_KEYS_PER_INVOCATION, len(candidate_pool))):
+        pick = helper_random.random_choice(candidate_pool)
+        candidate_pool.remove(pick)
+        slot_indices.append(pick)
+    keys = [f"{ANCIENT_KEY_PREFIX}{i}" for i in slot_indices]
+
+    # Snapshot each key's current value BEFORE producing. This is the
+    # `old_value` half of the assertion.
+    snapshots: list[tuple[str, bool, str | None]] = []
+    for key in keys:
+        found, value = _snapshot_current_value(key)
+        snapshots.append((key, found, value))
+
+    sometimes(
+        any(found for _, found, _ in snapshots),
+        "upsert: at least one ancient-ring key has a prior value to overwrite",
+        {"keys": [k for k, _, _ in snapshots]},
+    )
+
+    producer, tracker = make_producer(client_id=f"antithesis-ancient-{prefix}")
+
+    # Produce a fresh value to each ring slot. The value embeds our
+    # prefix + a per-write nonce so triage can distinguish "our write
+    # reached the source" from "some concurrent invocation wrote".
+    new_values: list[tuple[str, bool, str | None, str]] = []
+    for key, found, old_value in snapshots:
+        nonce = helper_random.random_u64()
+        new_value = f"cross-{prefix}-{nonce:016x}"
+        new_values.append((key, found, old_value, new_value))
+        _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, new_value)
+        producer.poll(0)
+
+    pending = producer.flush(timeout=30)
+    if pending > 0 or tracker.last_error is not None:
+        # Under sustained fault injection we can't prove which produces
+        # Kafka accepted. Bail before asserting — "writes that landed got
+        # reflected" doesn't apply to writes that didn't land.
+        LOG.info(
+            "skipping assertions: producer.flush pending=%d last_error=%s",
+            pending,
+            tracker.last_error,
+        )
+        return 0
+
+    max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT)
+    if max_produced < 0:
+        LOG.info("no produces confirmed; exiting cleanly")
+        return 0
+
+    request_quiet_period(QUIET_PERIOD_S)
+    caught_up = wait_for_catchup(
+        SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
+    )
+    sometimes(
+        caught_up,
+        "upsert: source caught up after cross-invocation produces",
+        {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced},
+    )
+    if not caught_up:
+        LOG.info("catchup did not complete in budget; skipping assertions")
+        return 0
+
+    my_value_observed = 0
+
+    for key, was_found, old_value, new_value in new_values:
+        found_after, observed = _snapshot_current_value(key)
+
+        if was_found:
+            # Safety property: writing into a key that had a prior value
+            # must change the source's view. Accepted outcomes:
+            #   * observed == new_value (we won the race)
+            #   * observed == <peer's cross-overwrite> (peer won)
+            #   * not found_after (peer tombstoned — this driver never
+            #     does so it'd have to be a future variant or an
+            #     external producer, but the shape is legitimate)
+            # The one outcome we must NEVER see is `found_after and
+            # observed == old_value`, which means our write was silently
+            # lost while no one else touched the key.
+            violation = found_after and observed == old_value
+            always(
+                not violation,
+                "upsert: write to ancient key changes its reflected value",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "old_value": old_value,
+                    "new_value_attempted": new_value,
+                    "observed_present": found_after,
+                    "observed_value": observed,
+                },
+            )
+        else:
+            # First-touch path: the ring slot was empty before. After
+            # producing a non-null value, the source must contain a row.
+            # The row's value is either ours or a peer's cross-overwrite;
+            # both are valid. The one outcome we must never see is
+            # `not found_after` — meaning a non-tombstone write to an
+            # empty key produced no row.
+            always(
+                found_after,
+                "upsert: write to previously-empty ancient key creates a row",
+                {
+                    "source": SOURCE_UPSERT_TEXT,
+                    "key": key,
+                    "new_value_attempted": new_value,
+                    "observed_present": found_after,
+                    "observed_value": observed,
+                },
+            )
+
+        if found_after and observed == new_value:
+            my_value_observed += 1
+
+    sometimes(
+        my_value_observed > 0,
+        "upsert: cross-invocation driver's own write reached the source",
+        {
+            "my_value_observed": my_value_observed,
+            "ancient_keys_targeted": len(new_values),
+        },
+    )
+
+    LOG.info(
+        "driver done; ancient_keys=%d my_value_observed=%d",
+        len(new_values),
+        my_value_observed,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 891668be02b75834c6c404f8357e44d9f09de284 Mon Sep 17 00:00:00 2001
From: Patrick Butler <patrick.butler@materialize.com>
Date: Thu, 14 May 2026 11:25:29 -0400
Subject: [PATCH 50/65] add assertion for gtid monotonicity violation in mysql

---
 .../source/mysql/replication/partitions.rs    |  10 ++
 ...ysql-source-gtid-monotonicity-violation.md | 117 +++++++++++++++
 .../scratchbook/property-catalog.md           |  15 +-
 .../anytime_mysql_source_no_gtid_errors.py    | 142 ++++++++++++++++++
 4 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md
 create mode 100644 test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py

diff --git a/src/storage/src/source/mysql/replication/partitions.rs b/src/storage/src/source/mysql/replication/partitions.rs
index c4a6a9ba743bc..7aef48cb3c2c8 100644
--- a/src/storage/src/source/mysql/replication/partitions.rs
+++ b/src/storage/src/source/mysql/replication/partitions.rs
@@ -11,6 +11,8 @@
 
 use std::collections::BTreeMap;
 
+use antithesis_sdk::assert_unreachable;
+use serde_json::json;
 use timely::progress::Antichain;
 use uuid::Uuid;
 
@@ -92,6 +94,14 @@ impl GtidReplicationPartitions {
                 // should only see GTID transaction-ids
                 // in a monotonic order for each source, starting at that upper.
                 if active_part.timestamp() > new_part.timestamp() {
+                    assert_unreachable!(
+                        "mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica",
+                        &json!({
+                            "source_uuid": source_id.to_string(),
+                            "active_timestamp": format!("{:?}", active_part.timestamp()),
+                            "new_timestamp": format!("{:?}", new_part.timestamp()),
+                        })
+                    );
                     let err = DefiniteError::BinlogGtidMonotonicityViolation(
                         source_id.to_string(),
                         new_part.timestamp().clone(),
diff --git a/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md b/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md
new file mode 100644
index 0000000000000..01e90975c4ba7
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md
@@ -0,0 +1,117 @@
+# mysql-source-gtid-monotonicity-violation — MySQL Source Must Not Enter Errored State Due to Out-of-Order GTIDs
+
+## Summary
+
+The Materialize MySQL CDC source must never receive GTIDs out of monotonic
+order from the multithreaded replica. If it does, `BinlogGtidMonotonicityViolation`
+(a `DefiniteError`) permanently errors the source — there is no self-recovery path.
+
+The pipeline is:
+
+```
+MySQL primary (GTID + WRITESET dependency tracking)
+    |
+    +--> MySQL replica (4 parallel workers, replica_preserve_commit_order=ON)
+                |
+         Materialize CDC source (mysql_cdc_source, antithesis_cluster)
+                |
+         antithesis_cdc table
+```
+
+With `replica_preserve_commit_order=ON` the replica guarantees it applies
+transactions in primary-commit order even with 4 concurrent applier threads.
+Under Antithesis fault injection — scheduling jitter, container kills at
+arbitrary points, network delays — this guarantee is stress-tested.
+
+## The Error
+
+`DefiniteError::BinlogGtidMonotonicityViolation` is raised in
+`src/storage/src/source/mysql/replication/partitions.rs:advance_frontier`
+when, for a given source UUID, `active_part.timestamp() > new_part.timestamp()`:
+a new GTID has a lower transaction-id than one the source already processed.
+
+Error message: `"received out of order gtids for source {uuid} at transaction-id {txn}"`
+
+Once emitted, this `DefiniteError` flows to `DataflowError::SourceError` and
+the source is permanently in the "errored" state.  The only recovery is a
+user-initiated `DROP SOURCE` + recreate.
+
+## Instrumentation
+
+**SUT-side** — `src/storage/src/source/mysql/replication/partitions.rs`.
+
+`assert_unreachable!("mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica", …)` fires immediately before the `DefiniteError` is returned.  This gives Antithesis a precise, reproducible anchor at the exact site where the violation is detected — before the error propagates and the source enters the errored state.
+
+**Workload-side** — `test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py`.
+
+The `anytime_` driver polls `mz_internal.mz_source_statuses` every 2 s for the
+`mysql_cdc_source` and evaluates the assertion below on each successful poll;
+it fails when `status = 'errored'` AND `error` contains `"out of order gtids"`:
+
+```python
+always(
+    not is_gtid_error,  # True normally; False triggers the property failure
+    "mysql: source must not enter errored state due to out-of-order GTIDs",
+    {"source": SOURCE_NAME, "status": status, "error": error, …},
+)
+```
+
+The workload-side check is complementary: it observes the effect at the
+user-visible surface, while the SUT-side assertion fires at the exact causal
+site inside the source operator.
+
+## Why This Property Matters
+
+With `replica_preserve_commit_order=ON` enabled, out-of-order GTIDs should
+be impossible.  This property tests whether Antithesis can find a schedule
+(crash timing, worker scheduling delay, partial replica state) under which
+the commit-order guarantee breaks down.  A violation surfaces as:
+
+1. The SUT-side `assert_unreachable!` firing (gives Antithesis a replay anchor).
+2. The source permanently stuck in "errored" state.
+3. The `mysql-source-no-data-loss` `always()` assertions becoming vacuous
+   (catchup never completes, liveness anchor never fires).
+
+## Assertion Types Chosen
+
+- `Unreachable` (SUT-side): the violation path in `advance_frontier` should
+  never be reached.  `assert_unreachable!` converts the error site into a
+  reportable Antithesis property.
+
+- `always(not is_gtid_error)` (workload-side): the observable effect (source
+  in "errored" state due to this error) must never be true.  `always()` is
+  correct because this is a hard safety invariant — every observation must hold.
+
+## Related Properties
+
+- `mysql-source-no-data-loss` — shares the MySQL CDC pipeline; a GTID
+  ordering violation will also cause the data-loss property assertions to
+  become vacuous (catchup never completes once the source is errored).
+- `storage-command-replay-idempotent` — MySQL CDC resume on clusterd restart
+  also exercises GTID position tracking; a corrupted GTID state after restart
+  could trigger this violation.
+
+## Schema
+
+```sql
+-- MySQL: commit-order-preserving multithreaded replication
+SET GLOBAL replica_parallel_workers = 4;
+SET GLOBAL replica_preserve_commit_order = ON;
+
+-- Materialize CDC source (reads from mysql-replica)
+CREATE SOURCE mysql_cdc_source IN CLUSTER antithesis_cluster
+    FROM MYSQL CONNECTION antithesis_mysql_conn;
+CREATE TABLE antithesis_cdc
+    FROM SOURCE mysql_cdc_source (REFERENCE antithesis.cdc_test);
+```
+
+## SUT Code Path
+
+```
+mysql/replication/partitions.rs :: GtidReplicationPartitions::advance_frontier
+  -> active_part.timestamp() > new_part.timestamp()
+  -> assert_unreachable!("mysql: BinlogGtidMonotonicityViolation …")  ← NEW
+  -> DefiniteError::BinlogGtidMonotonicityViolation(source_id, txn_id)
+  -> ReplicationError::Definite(…)
+  -> source enters "errored" state permanently
+```
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index d4074c3bf7e2e..ec139c3ea4ae9 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -1,8 +1,9 @@
 ---
 commit: 007c7af9d9970fb2030c7212368b232e0fbc363e
-updated: 2026-05-12
+updated: 2026-05-14
 ---
 <!-- Category 8 (MySQL CDC) added 2026-05-12: mysql-source-no-data-loss -->
+<!-- 2026-05-14: mysql-source-gtid-monotonicity-violation added to Category 9 -->
 
 # Property Catalog: Materialize
 
@@ -442,6 +443,18 @@ commit-order preservation) to the Antithesis environment.
 | **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. |
 | **Why It Matters** | MySQL CDC is a distinct ingestion code path from Kafka. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume — is not caught by the Kafka-source drivers. |
 
+### mysql-source-gtid-monotonicity-violation — MySQL Source Must Not Error Due to Out-of-Order GTIDs
+
+| | |
+|---|---|
+| **Type** | Safety (Unreachable) |
+| **Priority** | P1 — permanent source error with no self-recovery path; directly testable by Antithesis fault injection against the multithreaded replica's commit-order protocol |
+| **Status** | **Implemented (SUT-side + workload-side)** — `src/storage/src/source/mysql/replication/partitions.rs`: `assert_unreachable!("mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica", …)` fires immediately before `DefiniteError::BinlogGtidMonotonicityViolation` is returned in `advance_frontier`, giving Antithesis a precise replay anchor at the exact causal site. Workload-side: `test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py` polls `mz_internal.mz_source_statuses` every 2 s and fires `always(not is_gtid_error, "mysql: source must not enter errored state due to out-of-order GTIDs", …)` at the user-visible error surface. |
+| **Property** | The Materialize MySQL CDC source must never receive a GTID with a lower transaction-id than one already observed for the same UUID. With `replica_preserve_commit_order=ON` and 4 parallel replica workers, the commit-order protocol must hold even under Antithesis fault injection. |
+| **Invariant** | `Unreachable`: the `BinlogGtidMonotonicityViolation` error site in `advance_frontier` must never be reached. `Always`: `mz_internal.mz_source_statuses` for the MySQL CDC source must never show `status = 'errored'` with `error` containing `"out of order gtids"`. |
+| **Antithesis Angle** | Scheduling jitter under 4 parallel replica workers; container kills of the replica at arbitrary replication progress points; network delays between primary and replica that could desynchronize the commit-order queue. The property tests whether `replica_preserve_commit_order=ON` holds its guarantee when Antithesis controls the scheduler. |
+| **Why It Matters** | `BinlogGtidMonotonicityViolation` is a `DefiniteError` — the source is permanently stuck with no self-recovery path. It also silently neutralizes the `mysql-source-no-data-loss` liveness assertions (catchup never completes once the source is errored). Surfaced by: MySQL CDC source configuration, multithreaded replication correctness. |
+
 ### offset-known-not-below-committed — Source Statistics Causality
 
 | | |
diff --git a/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py
new file mode 100644
index 0000000000000..264ad67584f1a
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `mysql-source-gtid-monotonicity-violation`.
+
+The MySQL CDC source must never enter an errored state because of
+"received out of order gtids".  This error (BinlogGtidMonotonicityViolation)
+fires when the multithreaded replica delivers a GTID with a lower
+transaction-id than what was already observed for that UUID — permanently
+erroring the source with no self-recovery path.
+
+With 4 parallel replica workers and `replica_preserve_commit_order=ON`,
+this should be impossible: the commit-order protocol guarantees that GTIDs
+arrive in primary-commit order.  But under Antithesis fault injection
+(scheduling jitter, container kills, network delays) the commit-order
+guarantee is put to the test.  This driver records an `always()` failure the
+moment the errored state is observed, giving Antithesis a reportable
+property violation with a deterministic replay anchor.
+
+This is an `anytime_` driver — it runs continuously throughout the timeline
+so faults active during its polling window can be correlated with the first
+observed error.  A bounded run budget (`RUN_BUDGET_S`) prevents one instance
+from pinning resources; Antithesis re-launches it freely.
+
+Error-state detection is workload-observable: `mz_internal.mz_source_statuses`
+reports `status = 'errored'` with `error` containing the error message.  We
+check for the specific substring "out of order gtids" so the assertion is
+tight and won't fire on unrelated source errors.
+
+The complementary SUT-side assertion lives in
+`src/storage/src/source/mysql/replication/partitions.rs`:
+`assert_unreachable!("mysql: BinlogGtidMonotonicityViolation …")`.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+from antithesis.assertions import always
+from helper_mysql_source import SOURCE_NAME
+from helper_pg import query_retry
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.mysql_source_no_gtid_errors")
+
+# Knobs.
+POLL_INTERVAL_S = 2.0
+RUN_BUDGET_S = 60.0
+
+# Substring that identifies the specific error this property targets.
+_GTID_ORDER_ERROR = "out of order gtids"
+
+
+def _source_status() -> tuple[str, str | None] | None:
+    """Query status and error for the MySQL CDC source.
+
+    Returns (status, error_message) or None if the source doesn't exist yet
+    or the query fails (both are expected early in a timeline).
+    """
+    try:  # best-effort poll: any query failure is logged and reported as None
+        rows = query_retry(
+            """
+            SELECT ss.status, ss.error
+            FROM mz_internal.mz_source_statuses ss
+            JOIN mz_sources s ON s.id = ss.id
+            WHERE s.name = %s
+            """,
+            (SOURCE_NAME,),  # bound to the %s placeholder in the WHERE clause
+        )
+    except Exception as exc:  # noqa: BLE001
+        LOG.info("source status query failed: %s", exc)
+        return None
+    if not rows:  # no status row matched SOURCE_NAME
+        return None
+    status, error = rows[0]  # only the first matching row is inspected
+    return (status, error)
+
+
+def main() -> int:  # poll the source status until RUN_BUDGET_S elapses; returns 0
+    deadline = time.monotonic() + RUN_BUDGET_S
+    checks = 0  # number of polls that returned a status row
+
+    while time.monotonic() < deadline:
+        result = _source_status()
+        if result is None:  # no row or query failure: wait, then poll again
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        status, error = result
+        checks += 1
+
+        is_gtid_error = (
+            status == "errored"
+            and error is not None
+            and _GTID_ORDER_ERROR in error.lower()  # case-insensitive substring match
+        )
+
+        always(  # evaluated on every successful poll; fails only when is_gtid_error
+            not is_gtid_error,
+            "mysql: source must not enter errored state due to out-of-order GTIDs",
+            {
+                "source": SOURCE_NAME,
+                "status": status,
+                "error": error,
+                "note": (
+                    "BinlogGtidMonotonicityViolation fired — multithreaded replica "
+                    "delivered a GTID with lower txn-id than previously observed; "
+                    "replica_preserve_commit_order protocol violated under fault injection"
+                ),
+            },
+        )
+
+        if is_gtid_error:  # also surface the violation in the driver's own log
+            LOG.error(
+                "gtid monotonicity violation detected: status=%s error=%s",
+                status,
+                error,
+            )
+
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.info(
+        "mysql-source-no-gtid-errors done; %d status checks over %.0fs",
+        checks,
+        RUN_BUDGET_S,  # logs the configured budget, not the measured elapsed time
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 86d1fbbc058b019dd170263a7e202427250e7023 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 11:47:00 -0400
Subject: [PATCH 51/65] test/antithesis: bump clusterd workers to 16 and shrink
 pool to 2

Workers per clusterd: 4 -> 16. Single-process clusterds at workers=16
exercise the same intra-process concurrency surface as a 4-process
scale=4,workers=4 production deployment, giving us realistic per-shard
parallelism, scheduler contention, and Antithesis-thread-pause-fault
depth.

Pool size: 8 -> 2. The no-lock allocator already tolerates
oversubscription (concurrent invocations may share a pool cluster
because every workload object lives in a seed-scoped database), so a
smaller pool isn't a correctness concern. A pool of 2 keeps the
topology closer to production replica counts and makes each pool
cluster behave more like a busy production cluster.

Workers count is now plumbed through end-to-end:
  * mzcompose.py declares CLUSTERD_WORKERS=16 and uses it for every
    Clusterd(...) service AND exports it in the Workload service env.
  * workload-entrypoint.sh reads CLUSTERD_WORKERS and templates it into
    every CREATE CLUSTER REPLICAS' WORKERS clause (antithesis_cluster
    plus each pool_cluster_<i>). The controller reads WORKERS from this
    clause, not from clusterd's runtime config, so the two must stay
    in lockstep.

Total worker thread count goes from 4*8 + 4*2 = 40 (old: 8 pool + 2
antithesis) to 16*2 + 16*2 = 64 (new: 2 pool + 2 antithesis). Modest
memory increase, big throughput / parallelism gain.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml    | 249 +-----------------
 test/antithesis/mzcompose.py                  |  79 ++++--
 .../parallel-workload-fault-isolation.md      |  30 ++-
 .../test/parallel_driver_parallel_workload.py |   2 +-
 .../workload/workload-entrypoint.sh           |  27 +-
 5 files changed, 99 insertions(+), 288 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index e1439653361bb..8b162e1224a78 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -258,10 +258,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -296,10 +296,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -334,10 +334,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -372,238 +372,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-2:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-2
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-3:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-3
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-4:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-4
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-5:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-5
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-6:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-6
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2103"],
-      "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    volumes:
-    - mzdata:/mzdata
-    - mydata:/var/lib/mysql-files
-    - tmp:/share/tmp
-    - scratch:/scratch
-    restart: 'no'
-    stop_grace_period: 120s
-    platform: linux/amd64
-    image: ${MATERIALIZED_IMAGE}
-  clusterd-pool-7:
-    entrypoint:
-    - tini
-    - --
-    command:
-    - clusterd
-    ports:
-    - 2100
-    - 2101
-    - 6878
-    environment:
-    - CLUSTERD_GRPC_HOST=clusterd-pool-7
-    - CLUSTERD_USE_CTP=true
-    - MZ_SOFT_ASSERTIONS=1
-    - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100
-    - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101
-    - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878
-    - CLUSTERD_SECRETS_READER=local-file
-    - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets
-    - LD_PRELOAD=libeatmydata.so
-    - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
-    - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
-    - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2102"],
-      "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
-      false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -828,8 +600,9 @@ services:
     - KAFKA_BROKER=kafka:9092
     - SCHEMA_REGISTRY_URL=http://schema-registry:8081
     - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
-    - ANTITHESIS_CLUSTERD_POOL_SIZE=8
-    - CLUSTERD_POOL_SIZE=8
+    - ANTITHESIS_CLUSTERD_POOL_SIZE=2
+    - CLUSTERD_POOL_SIZE=2
+    - CLUSTERD_WORKERS=16
     - MYSQL_HOST=mysql
     - MYSQL_REPLICA_HOST=mysql-replica
     - MYSQL_PASSWORD=p@ssw0rd
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 2a75dce53b4e0..bfbc1abd6d4ca 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -54,11 +54,25 @@
 # Number of pool clusterd containers reserved for parallel-workload clusters
 # (one container per cluster, giving each its own container-level fault
 # domain). Read from the env so CI/local runs can tune it without editing
-# this file. Default 8 — enough for ~8 concurrent parallel-driver
-# invocations under the v1 "one cluster per invocation, replication
-# factor 1" allocation, see test/antithesis/workload/test/
-# parallel_driver_parallel_workload.py.
-CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "8"))
+# this file. Default 2 — the no-lock allocator (rng-picked slot per
+# invocation) tolerates oversubscription, and a smaller pool keeps the
+# topology closer to production replica counts.
+CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "2"))
+
+# Timely worker threads per clusterd process. Bumped to 16 to match the
+# per-process worker density of larger production cluster sizes — single-
+# process clusterds at workers=16 cover the same intra-process
+# concurrency surface as a 4-process scale=4,workers=4 production
+# deployment, so we exercise per-shard parallelism, scheduler contention,
+# and the Antithesis thread-pause fault target with realistic depth.
+#
+# This value must stay in lockstep with the `WORKERS N` clause in every
+# CREATE CLUSTER REPLICAS statement that targets these containers
+# (workload-entrypoint.sh reads it from the CLUSTERD_WORKERS env var
+# the Workload service passes through; the parallel-workload Python
+# driver consumes the same env via the framework's pool-cluster
+# wrapper).
+CLUSTERD_WORKERS = 16
 
 
 class Workload(Service):
@@ -94,6 +108,13 @@ def __init__(self) -> None:
                 # slot count.
                 f"ANTITHESIS_CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}",
                 f"CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}",
+                # Worker count for the WORKERS clause in every CREATE
+                # CLUSTER REPLICAS that targets a clusterd-pool or
+                # clusterd1/2 container. Must match the `workers=`
+                # argument passed to each `Clusterd(...)` Service below,
+                # because the controller reads it from this clause not
+                # from clusterd's runtime config.
+                f"CLUSTERD_WORKERS={CLUSTERD_WORKERS}",
                 # MySQL primary and replica connection details.
                 "MYSQL_HOST=mysql",
                 "MYSQL_REPLICA_HOST=mysql-replica",
@@ -145,14 +166,16 @@ def __init__(self) -> None:
     # Antithesis kill either replica's backing container without taking
     # the workload offline.
     #
-    # `workers=4` per clusterd means each replica runs four timely worker
-    # threads in one process. The extra intra-process parallelism is the
-    # surface area Antithesis's thread-pausing fault targets — with a
-    # single worker, "pause one thread" effectively pauses the whole
-    # process, which the container-pause fault already covers. The matching
-    # `WORKERS 4` in the CREATE CLUSTER REPLICAS statement must stay in
-    # lockstep with this value (it's read by the controller, not by
-    # clusterd).
+    # `workers=CLUSTERD_WORKERS` (16) per clusterd means each replica runs
+    # that many timely worker threads in one process. Sized to cover the
+    # per-process worker density of larger production cluster sizes:
+    # single-process clusterds at workers=16 exercise the same
+    # intra-process concurrency surface as a 4-process scale=4,workers=4
+    # production deployment (per-shard parallelism, scheduler contention,
+    # Antithesis thread-pause fault targets). The matching `WORKERS N`
+    # clause in every CREATE CLUSTER REPLICAS statement must equal this
+    # — workload-entrypoint.sh reads CLUSTERD_WORKERS from the env the
+    # Workload service exports.
     #
     # `scratch_directory=None` matches production: cluster replicas in
     # cloud deployments don't get a scratch disk, so the upsert operator's
@@ -165,31 +188,33 @@ def __init__(self) -> None:
     # loops on clusterd1 in an earlier run).
     Clusterd(
         name="clusterd1",
-        workers=4,
+        workers=CLUSTERD_WORKERS,
         scratch_directory=None,
     ),
     Clusterd(
         name="clusterd2",
-        workers=4,
+        workers=CLUSTERD_WORKERS,
         scratch_directory=None,
     ),
     # Pool of identical clusterd containers reserved for the
-    # parallel-workload driver. Each instance is a possible target for
-    # one parallel-workload cluster, giving that cluster its own
-    # container-level fault domain (Antithesis can kill / pause /
-    # partition / throttle a specific pool member without affecting any
-    # other cluster). Same settings as clusterd1/clusterd2: 4 timely
-    # workers per process, no scratch (matches production), restart=no
-    # so Antithesis fault injection isn't fought by docker-compose.
+    # parallel-workload driver. Each instance backs one long-lived
+    # `pool_cluster_<i>` (bootstrapped by workload-entrypoint.sh), giving
+    # that cluster its own container-level fault domain (Antithesis can
+    # kill / pause / partition / throttle a specific pool member without
+    # affecting any other cluster). Same settings as clusterd1/clusterd2:
+    # workers=CLUSTERD_WORKERS, no scratch (matches production),
+    # restart=no so Antithesis fault injection isn't fought by docker-
+    # compose.
     #
-    # Sizing rationale lives in test/antithesis/workload/test/
-    # parallel_driver_parallel_workload.py — the driver maps invocation
-    # seed → pool slot deterministically and assumes the pool is at
-    # least as big as the expected concurrent-invocation count.
+    # Pool sizing rationale lives in test/antithesis/workload/test/
+    # parallel_driver_parallel_workload.py — the driver picks a slot at
+    # random per invocation; with the no-lock allocator, multiple
+    # invocations may share a pool cluster (which is fine because every
+    # workload object lives in a seed-scoped database).
     *[
         Clusterd(
             name=f"clusterd-pool-{i}",
-            workers=4,
+            workers=CLUSTERD_WORKERS,
             scratch_directory=None,
         )
         for i in range(CLUSTERD_POOL_SIZE)
diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
index 884a030b704db..3aa97e55078d5 100644
--- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
+++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md
@@ -43,12 +43,15 @@ never dropped.
 
 Components, bottom-up:
 
-  - **`Clusterd(name="clusterd-pool-{i}", workers=4, scratch_directory=None)`**
-    in `test/antithesis/mzcompose.py`. Same configuration as
-    `clusterd1`/`clusterd2`: four timely workers per process (so
-    Antithesis thread-pause faults have something distinct to pause),
-    mem_env RocksDB (matches production, no scratch volume to fight over).
-    Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8).
+  - **`Clusterd(name="clusterd-pool-{i}", workers=CLUSTERD_WORKERS,
+    scratch_directory=None)`** in `test/antithesis/mzcompose.py`. Same
+    configuration as `clusterd1`/`clusterd2`: 16 timely workers per
+    process (matches the per-process worker density of larger
+    production cluster sizes — single-process clusterds at workers=16
+    cover the same intra-process concurrency surface as a 4-process
+    scale=4,workers=4 production deployment), mem_env RocksDB (matches
+    production, no scratch volume to fight over). Pool size from env
+    (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 2).
 
   - **Pool-cluster bootstrap** in
     `test/antithesis/workload/workload-entrypoint.sh`. After materialized
@@ -132,10 +135,12 @@ cluster identity reconnecting. That's the path reconcile is designed for.
     problem.
 
   - **Pool size much smaller than concurrency.** With C concurrent
-    invocations and N pool slots, ~C/N invocations share each cluster
-    in steady state. That's correctness-preserving but increases
-    per-cluster state pressure linearly with the ratio. Bump
-    `ANTITHESIS_CLUSTERD_POOL_SIZE` if a single pool cluster runs hot.
+    invocations and N pool slots (default N=2), ~C/N invocations share
+    each cluster in steady state. That's correctness-preserving but
+    increases per-cluster state pressure linearly with the ratio. The
+    pool is deliberately small so each pool cluster behaves more like
+    a busy production cluster; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if
+    a single pool cluster runs hot enough to mask other signals.
 
 ## v1 limitations (future work)
 
@@ -166,7 +171,8 @@ cluster identity reconnecting. That's the path reconcile is designed for.
 
 | Variable | Default | Effect |
 |---|---|---|
-| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 8 | Number of clusterd-pool-<i> containers deployed and matching pool_cluster_<i> clusters bootstrapped. |
-| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver chooses among. Mirrored from compose by mzcompose.py's Workload service so the two agree. |
+| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 2 | Number of clusterd-pool-<i> containers deployed and matching pool_cluster_<i> clusters bootstrapped. |
+| `CLUSTERD_POOL_SIZE` (driver) | 2 | Number of slots the driver chooses among. Mirrored from compose by mzcompose.py's Workload service so the two agree. |
+| `CLUSTERD_WORKERS` (compose + entrypoint) | 16 | Timely worker threads per clusterd process. Must match every CREATE CLUSTER REPLICAS' WORKERS clause and every `Clusterd(workers=...)` Service. |
 | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. |
 | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. |
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 427a0babc0f16..4f5302c714544 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -121,7 +121,7 @@
 # faults one container at a time, so the per-container fault domain is
 # preserved; multiple invocations witnessing the same fault is a
 # feature (more independent reproductions per failure).
-CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8"))
+CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "2"))
 
 
 def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None:
diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh
index bfef3f12e4c1a..ba2b95a8c8b2c 100755
--- a/test/antithesis/workload/workload-entrypoint.sh
+++ b/test/antithesis/workload/workload-entrypoint.sh
@@ -20,7 +20,13 @@ CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}"
 # Number of long-lived pool clusters to bootstrap, each bound to its own
 # clusterd-pool-{i} container. Must match `ANTITHESIS_CLUSTERD_POOL_SIZE`
 # in mzcompose.py and `CLUSTERD_POOL_SIZE` in the parallel-workload driver.
-CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-8}"
+CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-2}"
+# Timely worker threads per clusterd process — must equal the `workers=`
+# argument every `Clusterd(...)` Service in mzcompose.py passes, because
+# the controller reads worker count from the WORKERS clause we put in
+# CREATE CLUSTER REPLICAS, not from clusterd's runtime config. Plumbed
+# in via the Workload service's environment.
+CLUSTERD_WORKERS="${CLUSTERD_WORKERS:-16}"
 
 # Wait for materialized to be ready.
 echo "Waiting for materialized to become healthy..."
@@ -52,14 +58,14 @@ CREATE CLUSTER ${CLUSTER} REPLICAS (
         STORAGE ADDRESSES ['clusterd1:2103'],
         COMPUTECTL ADDRESSES ['clusterd1:2101'],
         COMPUTE ADDRESSES ['clusterd1:2102'],
-        WORKERS 4
+        WORKERS ${CLUSTERD_WORKERS}
     ),
     replica2 (
         STORAGECTL ADDRESSES ['clusterd2:2100'],
         STORAGE ADDRESSES ['clusterd2:2103'],
         COMPUTECTL ADDRESSES ['clusterd2:2101'],
         COMPUTE ADDRESSES ['clusterd2:2102'],
-        WORKERS 4
+        WORKERS ${CLUSTERD_WORKERS}
     )
 );
 GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER};
@@ -70,12 +76,13 @@ fi
 
 # Bootstrap a long-lived `pool_cluster_{i}` for each clusterd-pool-{i}
 # container. Each pool cluster has exactly one replica wired to its
-# matching pool clusterd. Parallel-workload driver invocations claim a
-# slot (via fcntl.flock on the workload container's filesystem) and run
-# against `pool_cluster_{slot}` for their entire lifetime. The cluster
-# identity is tied to the clusterd identity, so reconnects don't trip
-# clusterd's `instance configuration not compatible` halt; only the
-# seed-scoped database / roles get dropped between invocations.
+# matching pool clusterd. Parallel-workload driver invocations pick a
+# slot at random and run against `pool_cluster_{slot}`; concurrent
+# invocations may share a pool cluster (every workload object is in a
+# seed-scoped database so they don't collide). The cluster identity is
+# tied to the clusterd identity, so reconnects don't trip clusterd's
+# `instance configuration not compatible` halt; only the seed-scoped
+# database / roles get dropped between invocations.
 #
 # Idempotent: skip pool clusters that already exist (the SUT's catalog
 # survives across `docker compose up` if metadata volumes aren't wiped).
@@ -97,7 +104,7 @@ CREATE CLUSTER ${POOL_CLUSTER} REPLICAS (
         STORAGE ADDRESSES ['clusterd-pool-${i}:2103'],
         COMPUTECTL ADDRESSES ['clusterd-pool-${i}:2101'],
         COMPUTE ADDRESSES ['clusterd-pool-${i}:2102'],
-        WORKERS 4
+        WORKERS ${CLUSTERD_WORKERS}
     )
 );
 GRANT ALL ON CLUSTER ${POOL_CLUSTER} TO ${PGUSER};

From d0aa7fbecc1d5805c8aab1b95f2d4b5707e418f4 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 12:10:48 -0400
Subject: [PATCH 52/65] test/antithesis: add MyISAM cdc table to mysql workload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Covers the non-transactional DML axis of the MySQL CDC source.
Materialize's MySQL source code path is engine-agnostic (binlog ROW
events look the same for MyISAM and InnoDB), so this exists to assert
the engine-agnostic contract holds in practice when the upstream is
MyISAM:

  * BEGIN/COMMIT around MyISAM statements is silently ignored — each
    statement commits immediately with its own GTID.
  * No rollback semantics: a statement killed mid-write leaves whatever
    rows it managed to insert committed.
  * Table-level locking instead of row-level.

A MyISAM-specific regression would surface as the new driver's
'mysql myisam: CDC source row has correct value after catchup' firing
false while the existing InnoDB-backed driver looks healthy.

What landed:
  * first_mysql_replica_setup.py creates antithesis.cdc_test_myisam
    (ENGINE=MyISAM) alongside the existing cdc_test (ENGINE=InnoDB) on
    the primary, and waits for both to replicate to the replica.
  * helper_mysql_source.py exposes MYSQL_TABLE_MYISAM /
    TABLE_NAME_MYISAM constants and an ensure_mysql_cdc_myisam_table()
    helper. ensure_mysql_cdc_source() now creates both subsources off
    the single mysql_cdc_source SOURCE.
  * parallel_driver_mysql_myisam.py mirrors the InnoDB sibling's shape
    against the MyISAM subsource with the 'myi-p<u64hex>' batch prefix
    so the two drivers don't interfere.
  * Property doc scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
    captures the bug class, MyISAM-specific binlog semantics, and the
    assertion list.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../mysql-myisam-cdc-no-data-loss.md          |  82 ++++++
 .../test/first_mysql_replica_setup.py         |  66 +++--
 .../workload/test/helper_mysql_source.py      |  43 ++-
 .../test/parallel_driver_mysql_myisam.py      | 249 ++++++++++++++++++
 4 files changed, 414 insertions(+), 26 deletions(-)
 create mode 100644 test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
 create mode 100644 test/antithesis/workload/test/parallel_driver_mysql_myisam.py

diff --git a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
new file mode 100644
index 0000000000000..5e5b6fd239f0e
--- /dev/null
+++ b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
@@ -0,0 +1,82 @@
+# mysql-myisam-cdc-no-data-loss
+
+## Summary
+
+Every row inserted into a MyISAM table on the MySQL primary must eventually appear, with the correct value, in the Materialize CDC source that reads from the multithreaded replica — under the same guarantees as the InnoDB-backed `mysql-source-no-data-loss`.
+
+This property exists separately from `mysql-source-no-data-loss` to cover the non-transactional DML axis: MyISAM in MySQL has fundamentally different transactional semantics, and Materialize's MySQL source code path doesn't distinguish engines, so we assert the engine-agnostic contract holds in practice.
+
+## Why MyISAM is interesting for CDC testing
+
+MyISAM differs from InnoDB in ways that show up in the binlog event stream:
+
+* **No multi-statement transactions.** BEGIN/COMMIT around MyISAM statements is silently ignored. Every MyISAM statement is its own implicit transaction and gets its own GTID — there is no bundling of multiple statements under one GTID block.
+* **No rollback.** A statement that fails partway through (e.g., a multi-row INSERT killed by a fault between rows 50 and 51) leaves the partial result committed. Whatever rows made it to the engine are durable; nothing rolls back.
+* **Table-level locking instead of row-level.** Concurrent writers serialize rather than abort-and-retry. The binlog sees a strict serial order.
+* **No crash recovery via redo log.** A crash mid-statement on the primary can leave the on-disk MyISAM table inconsistent with the binlog, but for our purposes we run against a healthy primary and read CDC from the replica, so this only matters under specific Antithesis fault-injection patterns.
+
+Materialize's MySQL source decodes binlog events without consulting the upstream engine. ROW-format binlog events look identical for MyISAM and InnoDB. The source's expected contract is "every binlog event is reflected in the Materialize table"; we assert that this contract holds when the upstream is MyISAM.
+
+## Code paths
+
+Same as `mysql-source-no-data-loss`:
+- `src/storage/src/source/mysql/replication/partitions.rs` — binlog event decoding, GTID monotonicity check.
+- `src/storage/src/source/mysql/snapshot.rs` — initial snapshot from the replica (uses `LOCK TABLES ... READ` for non-transactional engines).
+- `src/mysql-util` — connection management.
+
+No engine-specific code in the source. That's the property we're verifying.
+
+## How to check it
+
+Workload procedure (per invocation):
+1. Pick a per-invocation `batch_id` prefix (`myi-p<u64hex>`) so concurrent drivers — including the InnoDB sibling — don't collide.
+2. Insert 20 rows into `antithesis.cdc_test_myisam` on the MySQL primary. Each INSERT is its own implicit transaction.
+3. Record the {id → value} map locally.
+4. Request an Antithesis quiet period.
+5. Poll `COUNT(*) FROM antithesis_cdc_myisam WHERE batch_id = ?` until it reaches the inserted count or the budget expires.
+6. For each row, `SELECT value FROM antithesis_cdc_myisam WHERE id = ?` with `real_time_recency=true`. Assert `value` matches the locally-recorded one.
+
+## What goes wrong on violation
+
+Same failure modes as `mysql-source-no-data-loss`: missing rows, rows with wrong values, or extra rows. The bug is silent — the workload sees plausible-but-wrong data.
+
+A MyISAM-specific failure mode worth flagging in triage: if the Materialize source were to *accidentally* treat MyISAM events differently (e.g., treat the absence of a transaction wrapper as the absence of an event), we'd see consistent under-counting on the MyISAM subsource while the InnoDB sibling looks healthy.
+
+## Antithesis angle
+
+The same fault classes that hit `mysql-source-no-data-loss` apply:
+- MySQL primary container pause / restart between insert and binlog flush.
+- MySQL replica container pause / restart between binlog ingestion and Materialize-side consumption.
+- Materialized container pause / restart between CDC ingestion and persist append.
+- Clusterd-pool container pause / restart on the cluster running the MySQL source.
+
+Specifically MyISAM-relevant scenarios:
+- A multi-row INSERT killed mid-statement should leave only the rows that actually committed. The replica's binlog reflects exactly those rows. Materialize must see exactly those rows. (The driver inserts row-by-row in a Python loop so we don't directly exercise "kill a multi-row INSERT," but Antithesis can pause the primary between any two row-INSERTs in the loop, achieving the same shape.)
+- GTID ordering with MyISAM is per-statement: a workload that interleaves MyISAM and InnoDB writes produces an alternating GTID stream. Materialize must honor that ordering. (The InnoDB sibling driver and this driver run as independent parallel-driver invocations, naturally producing interleaved binlog events.)
+
+## Dependencies
+
+- Requires `gtid_mode = ON` and `binlog_format = ROW` on the primary (already set by mzcompose).
+- Requires the MyISAM table on the primary AND on the replica (provisioned by `first_mysql_replica_setup.py`).
+- The Materialize MySQL source must include the MyISAM table as a referenced subsource (`ensure_mysql_cdc_myisam_table()` in `helper_mysql_source.py`).
+
+## Existing instrumentation
+
+None engine-specific. The general `mysql-source-gtid-monotonicity-violation` SUT assertion (introduced 2026-05-14) covers GTID ordering for both engines uniformly.
+
+## Implementation status
+
+Implemented as `test/antithesis/workload/test/parallel_driver_mysql_myisam.py`.
+
+| Message | Type | Fires when |
+|---------|------|------------|
+| `"mysql myisam: CDC source row has correct value after catchup"` | `always` | Per row, after catchup. False ⟺ row missing or value wrong. |
+| `"mysql myisam: CDC source row count matches inserted count after catchup"` | `always` | Per invocation, after catchup. False ⟺ extra or missing rows for this batch. |
+| `"mysql myisam: CDC source caught up to all primary inserts after quiet period"` | `sometimes` | Per invocation. Liveness for the catchup gate. |
+| `"mysql replica: both cdc_test tables replicated from primary within 90s"` | `sometimes` | Per timeline (fires once from `first_mysql_replica_setup`). Confirms replication is flowing for both engines. |
+
+Knobs: `ROWS_PER_INVOCATION=20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=90.0`.
+
+## Provenance
+
+Surfaced by: Data Integrity (engine-agnostic CDC contract).
diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py
index ee603e60e88d6..f697334beb51f 100644
--- a/test/antithesis/workload/test/first_mysql_replica_setup.py
+++ b/test/antithesis/workload/test/first_mysql_replica_setup.py
@@ -40,8 +40,23 @@
 
 
 def setup_primary() -> None:
-    """Create the antithesis schema and cdc_test table on the MySQL primary."""
-    LOG.info("creating antithesis database and cdc_test table on primary")
+    """Create the antithesis schema and both cdc_test tables on the MySQL
+    primary.
+
+    Two tables on different engines so we exercise both the transactional
+    (InnoDB) and non-transactional (MyISAM) DML paths through the binlog
+    and the Materialize MySQL source. MyISAM differences worth noting for
+    triage:
+      * BEGIN/COMMIT around MyISAM statements is silently ignored — each
+        statement commits immediately.
+      * Each MyISAM statement is its own GTID-tagged binlog event (no
+        bundling into a multi-statement transaction).
+      * No rollback semantics: a MyISAM statement that fails partway
+        through leaves whatever rows it managed to write committed.
+      * Schema (not engine) difference: the MyISAM table omits updated_at;
+        TIMESTAMP(6) ON UPDATE needs MySQL 5.6.4+, so this avoids version churn.
+    """
+    LOG.info("creating antithesis database and cdc_test tables on primary")
     helper_mysql.execute_primary("CREATE DATABASE IF NOT EXISTS antithesis")
     helper_mysql.execute_primary(
         """
@@ -51,11 +66,21 @@ def setup_primary() -> None:
             value TEXT NOT NULL,
             updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6)
                 ON UPDATE CURRENT_TIMESTAMP(6)
-        )
+        ) ENGINE=InnoDB
+        """,
+        database="antithesis",
+    )
+    helper_mysql.execute_primary(
+        """
+        CREATE TABLE IF NOT EXISTS antithesis.cdc_test_myisam (
+            id VARCHAR(64) NOT NULL PRIMARY KEY,
+            batch_id VARCHAR(64) NOT NULL,
+            value TEXT NOT NULL
+        ) ENGINE=MyISAM
         """,
         database="antithesis",
     )
-    LOG.info("antithesis.cdc_test ready on primary")
+    LOG.info("antithesis.cdc_test (InnoDB) and cdc_test_myisam (MyISAM) ready on primary")
 
 
 def configure_replica() -> None:
@@ -94,26 +119,33 @@ def configure_replica() -> None:
     LOG.info("MySQL replica started")
 
 
-def wait_for_replica_table(timeout_s: float = 90.0) -> bool:
-    """Wait until antithesis.cdc_test is visible on the replica.
+def wait_for_replica_tables(timeout_s: float = 90.0) -> bool:
+    """Wait until both antithesis.cdc_test (InnoDB) and cdc_test_myisam
+    (MyISAM) are visible on the replica.
 
-    Returns True when the table appears (replication is flowing), False on
-    timeout.
+    Returns True when both tables appear (replication is flowing across
+    both engines), False on timeout.
     """
     deadline = time.monotonic() + timeout_s
+    needed = {"cdc_test", "cdc_test_myisam"}
     while time.monotonic() < deadline:
         try:
             rows = helper_mysql.query_replica(
-                "SELECT 1 FROM information_schema.tables "
-                "WHERE table_schema = 'antithesis' AND table_name = 'cdc_test'",
+                "SELECT table_name FROM information_schema.tables "
+                "WHERE table_schema = 'antithesis' "
+                "AND table_name IN ('cdc_test', 'cdc_test_myisam')",
             )
-            if rows:
-                LOG.info("antithesis.cdc_test visible on replica — replication flowing")
+            seen = {r[0] for r in rows}
+            if needed.issubset(seen):
+                LOG.info(
+                    "antithesis cdc tables visible on replica — replication flowing (%s)",
+                    sorted(seen),
+                )
                 return True
         except Exception as exc:  # noqa: BLE001
-            LOG.info("waiting for replica table: %s", exc)
+            LOG.info("waiting for replica tables: %s", exc)
         time.sleep(2)
-    LOG.warning("timed out waiting for antithesis.cdc_test on replica")
+    LOG.warning("timed out waiting for antithesis.cdc_test{,_myisam} on replica")
     return False
 
 
@@ -127,10 +159,10 @@ def main() -> int:
     setup_primary()
     configure_replica()
 
-    replica_ready = wait_for_replica_table()
+    replica_ready = wait_for_replica_tables()
     sometimes(
         replica_ready,
-        "mysql replica: antithesis.cdc_test replicated from primary within 90s",
+        "mysql replica: both cdc_test tables replicated from primary within 90s",
         {
             "primary": helper_mysql.MYSQL_HOST,
             "replica": helper_mysql.MYSQL_REPLICA_HOST,
@@ -139,7 +171,7 @@ def main() -> int:
     if not replica_ready:
         # Proceed anyway — replication may catch up before Materialize tries to
         # validate the source, but log a warning so triage can correlate.
-        LOG.warning("replica table not yet visible; proceeding with source creation")
+        LOG.warning("replica tables not yet visible; proceeding with source creation")
 
     ensure_mysql_cdc_source()
 
diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py
index 34323a846faed..b45af21e5a6e7 100644
--- a/test/antithesis/workload/test/helper_mysql_source.py
+++ b/test/antithesis/workload/test/helper_mysql_source.py
@@ -20,6 +20,14 @@
   - CONNECTION antithesis_mysql_conn  -> mysql-replica
   - SOURCE  mysql_cdc_source          (IN CLUSTER antithesis_cluster)
   - TABLE   antithesis_cdc            (REFERENCE antithesis.cdc_test)
+  - TABLE   antithesis_cdc_myisam     (REFERENCE antithesis.cdc_test_myisam)
+
+The MyISAM-backed reference exercises CDC for non-transactional DML: in
+MySQL, MyISAM statements commit immediately (BEGIN/COMMIT is silently
+ignored), so the binlog sees them as standalone events with their own
+GTIDs rather than bundled inside a transaction. Materialize's source
+code path doesn't distinguish engines, so this is a property check that
+the engine-agnostic behavior actually holds.
 """
 
 from __future__ import annotations
@@ -38,11 +46,13 @@
 
 MYSQL_DATABASE = "antithesis"
 MYSQL_TABLE = "cdc_test"
+MYSQL_TABLE_MYISAM = "cdc_test_myisam"
 
 SECRET_NAME = "antithesis_mysql_password"
 CONNECTION_NAME = "antithesis_mysql_conn"
 SOURCE_NAME = "mysql_cdc_source"
 TABLE_NAME = "antithesis_cdc"
+TABLE_NAME_MYISAM = "antithesis_cdc_myisam"
 
 
 def ensure_mysql_connection() -> None:
@@ -60,30 +70,44 @@ def ensure_mysql_connection() -> None:
     )
 
 
-def ensure_mysql_cdc_table() -> None:
-    """Create the Materialize table from the MySQL CDC source (idempotent)."""
+def _ensure_mysql_cdc_subtable(mz_table: str, upstream_table: str) -> None:
+    """Create one Materialize table that references `upstream_table` in the
+    MySQL CDC source (idempotent). Shared between the InnoDB and MyISAM
+    references; both come from the same source.
+    """
     try:
         execute_retry(
-            f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} "
+            f"CREATE TABLE IF NOT EXISTS {mz_table} "
             f"FROM SOURCE {SOURCE_NAME} "
-            f"(REFERENCE {MYSQL_DATABASE}.{MYSQL_TABLE})"
+            f"(REFERENCE {MYSQL_DATABASE}.{upstream_table})"
         )
     except psycopg.errors.InternalError as exc:
         if "already exists" not in str(exc):
             raise
-        rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,))
+        rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (mz_table,))
         if rows:
-            LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME)
+            LOG.info("table %s landed concurrently; tolerating collision", mz_table)
             return
         raise
-    LOG.info("mysql cdc table %s ready", TABLE_NAME)
+    LOG.info("mysql cdc table %s ready (upstream=%s)", mz_table, upstream_table)
+
+
+def ensure_mysql_cdc_table() -> None:
+    """Create the InnoDB-backed Materialize table from the source."""
+    _ensure_mysql_cdc_subtable(TABLE_NAME, MYSQL_TABLE)
+
+
+def ensure_mysql_cdc_myisam_table() -> None:
+    """Create the MyISAM-backed Materialize table from the source."""
+    _ensure_mysql_cdc_subtable(TABLE_NAME_MYISAM, MYSQL_TABLE_MYISAM)
 
 
 def ensure_mysql_cdc_source() -> None:
     """Create the full MySQL CDC pipeline in Materialize (idempotent).
 
-    Requires antithesis.cdc_test to already exist on the MySQL replica.
-    Call first_mysql_replica_setup.py before this in any standalone use.
+    Requires antithesis.cdc_test AND antithesis.cdc_test_myisam to already
+    exist on the MySQL replica. Call first_mysql_replica_setup.py before
+    this in any standalone use.
     """
     ensure_mysql_connection()
     create_source_idempotent(
@@ -94,3 +118,4 @@ def ensure_mysql_cdc_source() -> None:
     )
     LOG.info("mysql cdc source %s ready", SOURCE_NAME)
     ensure_mysql_cdc_table()
+    ensure_mysql_cdc_myisam_table()
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
new file mode 100644
index 0000000000000..00542bba536bc
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `mysql-myisam-cdc-no-data-loss`.
+
+Sibling driver `parallel_driver_mysql_cdc.py` exercises the same property
+shape against the InnoDB-backed `antithesis.cdc_test`. This driver
+exercises the non-transactional flavor against the MyISAM-backed
+`antithesis.cdc_test_myisam`. The Materialize MySQL source code path
+doesn't distinguish engines — the binlog/CDC contract is engine-agnostic
+— so this is a check that the engine-agnostic behavior actually holds
+under non-transactional upstream DML.
+
+What's different about MyISAM in the binlog:
+  * BEGIN/COMMIT around MyISAM statements is silently ignored — each
+    statement commits immediately.
+  * Every MyISAM statement gets its own GTID (one transaction per
+    statement, not per BEGIN/COMMIT block).
+  * No rollback semantics: a statement that fails partway through leaves
+    whatever rows it managed to insert committed and visible to the
+    binlog / replica / Materialize source.
+  * No table-locking deadlock recovery: the storage engine takes
+    table-level locks, so concurrent writers serialize rather than
+    abort-and-retry.
+
+These differences shouldn't affect Materialize's view of the data: every
+acknowledged INSERT must appear in the CDC source with the right value.
+That's the property this driver asserts, with the same shape as
+`parallel_driver_mysql_cdc.py`.
+
+Each invocation:
+  1. Checks the MySQL CDC source and the MyISAM reference table exist.
+  2. Picks a per-invocation `batch_id` prefix so concurrent drivers
+     (including the InnoDB sibling) don't collide.
+  3. Inserts ROWS_PER_INVOCATION rows to the MyISAM table on the primary.
+  4. Requests an Antithesis quiet period and polls the Materialize source
+     table until all expected rows appear (or the budget expires).
+  5. Asserts correctness via `always(...)` on count and per-row values.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_mysql
+import helper_random
+from antithesis.assertions import always, sometimes
+from helper_mysql_source import (
+    MYSQL_DATABASE,
+    MYSQL_TABLE_MYISAM,
+    SOURCE_NAME,
+    TABLE_NAME_MYISAM,
+)
+from helper_pg import query_retry
+from helper_quiet import request_quiet_period
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.mysql_myisam")
+
+ROWS_PER_INVOCATION = 20
+QUIET_PERIOD_S = 25
+CATCHUP_TIMEOUT_S = 90.0
+POLL_INTERVAL_S = 1.0
+
+
+def _source_ready() -> bool:
+    """Source + MyISAM reference table both exist in Materialize."""
+    src = query_retry("SELECT 1 FROM mz_sources WHERE name = %s", (SOURCE_NAME,))
+    tbl = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME_MYISAM,))
+    return bool(src) and bool(tbl)
+
+
+def _insert_rows(batch_id: str) -> dict[str, str]:
+    """Insert ROWS_PER_INVOCATION rows to the MyISAM table.
+
+    Each insert is its own implicit transaction (MyISAM ignores BEGIN/
+    COMMIT). Returns {id → value} for every successfully inserted row.
+    Failures are logged and skipped: under fault injection the primary
+    may be unreachable mid-loop, and the property is "every
+    acknowledged INSERT shows up," not "every attempted INSERT shows up."
+    """
+    expected: dict[str, str] = {}
+    for i in range(ROWS_PER_INVOCATION):
+        row_id = f"{batch_id}:{i}"
+        value = f"v{helper_random.random_int(0, 9999):04d}"
+        try:
+            helper_mysql.execute_primary(
+                f"INSERT INTO {MYSQL_DATABASE}.{MYSQL_TABLE_MYISAM} "
+                "(id, batch_id, value) VALUES (%s, %s, %s) "
+                "ON DUPLICATE KEY UPDATE value = VALUES(value), batch_id = VALUES(batch_id)",
+                (row_id, batch_id, value),
+                database=MYSQL_DATABASE,
+            )
+            expected[row_id] = value
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("MyISAM insert failed for row %s: %s; skipping", row_id, exc)
+    return expected
+
+
+def _wait_for_catchup(batch_id: str, expected_count: int) -> bool:
+    """Poll Materialize until all expected rows for `batch_id` appear in
+    the MyISAM-referenced subsource.
+    """
+    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
+    last_seen = -1
+    while time.monotonic() < deadline:
+        try:
+            rows = query_retry(
+                f"SELECT COUNT(*)::bigint FROM {TABLE_NAME_MYISAM} WHERE batch_id = %s",
+                (batch_id,),
+            )
+            count = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("catchup poll failed: %s; retrying", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        if count != last_seen:
+            LOG.info(
+                "mysql myisam catchup: batch=%s observed=%d target=%d",
+                batch_id,
+                count,
+                expected_count,
+            )
+            last_seen = count
+
+        if count >= expected_count:
+            return True
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.warning(
+        "mysql myisam catchup timeout: batch=%s last_seen=%d target=%d",
+        batch_id,
+        last_seen,
+        expected_count,
+    )
+    return False
+
+
+def _check_rows(expected: dict[str, str]) -> None:
+    """Assert every expected row has the correct value in the Materialize
+    MyISAM-referenced subsource. Uses real_time_recency so the per-row
+    SELECT chosen-ts waits for the MySQL source's real-time upstream
+    frontier; the count-based catchup above can clear at a chosen-ts that
+    just barely satisfies the COUNT, leaving a per-row SELECT moments
+    later to race.
+    """
+    for row_id, want in expected.items():
+        rows = query_retry(
+            f"SELECT value FROM {TABLE_NAME_MYISAM} WHERE id = %s",
+            (row_id,),
+            real_time_recency=True,
+        )
+        found = bool(rows)
+        observed = rows[0][0] if found else None
+        always(
+            found and observed == want,
+            "mysql myisam: CDC source row has correct value after catchup",
+            {
+                "source": TABLE_NAME_MYISAM,
+                "id": row_id,
+                "expected_value": want,
+                "observed_present": found,
+                "observed_value": observed,
+            },
+        )
+
+
+def main() -> int:
+    """Run one insert -> quiet period -> catch-up -> assert cycle; return 0."""
+    if not _source_ready():
+        # first_mysql_replica_setup must run before this driver. Outside
+        # Antithesis (e.g. snouty validate) the source / MyISAM table may not
+        # exist yet — exit cleanly so validate can still proceed.
+        LOG.warning(
+            "mysql cdc source %s or MyISAM table %s not found; skipping "
+            "(first_mysql_replica_setup must run first)",
+            SOURCE_NAME,
+            TABLE_NAME_MYISAM,
+        )
+        return 0
+
+    batch_id = f"myi-p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; batch_id=%s", batch_id)
+
+    expected = _insert_rows(batch_id)
+    if not expected:
+        LOG.info("no rows inserted successfully this invocation; exiting cleanly")
+        return 0
+
+    LOG.info("inserted %d rows; requesting quiet period", len(expected))
+    request_quiet_period(QUIET_PERIOD_S)
+    caught_up = _wait_for_catchup(batch_id, len(expected))
+    sometimes(
+        caught_up,
+        "mysql myisam: CDC source caught up to all primary inserts after quiet period",
+        {
+            "source": TABLE_NAME_MYISAM,
+            "batch_id": batch_id,
+            "rows_inserted": len(expected),
+        },
+    )
+
+    if not caught_up:
+        LOG.info("catchup did not complete in budget; skipping per-row assertions")
+        return 0
+
+    _check_rows(expected)
+    rows = query_retry(
+        f"SELECT COUNT(*)::bigint FROM {TABLE_NAME_MYISAM} WHERE batch_id = %s",
+        (batch_id,),
+        real_time_recency=True,
+    )
+    count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+    # An INSERT whose ack was lost may still have landed (MyISAM can't roll
+    # back), so bound by attempts instead; assumes `id` is a unique key.
+    always(
+        len(expected) <= count_in_mz <= ROWS_PER_INVOCATION,
+        "mysql myisam: CDC source row count matches inserted count after catchup",
+        {
+            "source": TABLE_NAME_MYISAM,
+            "batch_id": batch_id,
+            "expected_count": len(expected),
+            "attempted_count": ROWS_PER_INVOCATION,
+            "observed_count": count_in_mz,
+        },
+    )
+
+    LOG.info(
+        "driver done; asserted on %d MyISAM rows for batch_id=%s",
+        len(expected),
+        batch_id,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 8060eb2b264c89c7c2ce0739c9bbcd84999a77c8 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 12:53:19 -0400
Subject: [PATCH 53/65] test/antithesis: per-service container_name + hostname
 + explicit bridge network
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes the Antithesis platform needs from the docker-compose YAML,
applied together at export time so they stay in lockstep:

1. Set `container_name` and `hostname` on every service (matching the
   service key). Per Antithesis docker best practices, triage reports
   attribute log lines and assertions by hostname; without an explicit
   hostname the platform infers one (possibly the container id) that's
   harder to recognize. The workload container is the highest-value
   case but the rule is uniform.

2. Define a named bridge network (`antithesis-net`) at the top level
   and put every service on it. Relying on docker-compose's auto-
   generated `default` network was leaving DNS resolution up to
   whatever the surrounding Antithesis orchestration decided; an
   earlier run on this stack failed with kafka unable to resolve
   `zookeeper` (UnknownHostException) during setup. Antithesis support
   pointed at the network shape as the likely cause and suggested
   declaring it explicitly. Not setting `internal: true` per Antithesis
   docker best practices — that would cut us off from the Antithesis-
   side instrumentation network.

Both transforms live in export-compose.py so they apply uniformly to
every present and future service. Sanity-check that no service key
contains an underscore (RFC-1123); all current keys already use
hyphens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml | 56 ++++++++++++++++++-
 test/antithesis/export-compose.py          | 65 +++++++++++++++++++++-
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 8b162e1224a78..a7033da124574 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -68,6 +68,10 @@ services:
 
       exec docker-entrypoint.sh "$$@"'
     - --
+    container_name: postgres-metadata
+    hostname: postgres-metadata
+    networks:
+    - antithesis-net
   minio:
     entrypoint:
     - sh
@@ -93,6 +97,10 @@ services:
       start_period: 30s
     platform: linux/amd64
     image: minio/minio:latest
+    container_name: minio
+    hostname: minio
+    networks:
+    - antithesis-net
   zookeeper:
     image: confluentinc/cp-zookeeper:7.9.4
     ports:
@@ -109,6 +117,10 @@ services:
       interval: 1s
       start_period: 120s
     platform: linux/amd64
+    container_name: zookeeper
+    hostname: zookeeper
+    networks:
+    - antithesis-net
   kafka:
     image: confluentinc/cp-kafka:7.9.4
     ports:
@@ -140,13 +152,16 @@ services:
       interval: 1s
       start_period: 120s
     platform: linux/amd64
+    container_name: kafka
+    hostname: kafka
+    networks:
+    - antithesis-net
   schema-registry:
     image: confluentinc/cp-schema-registry:7.9.4
     ports:
     - 8081
     networks:
-      default:
-        aliases: []
+    - antithesis-net
     environment:
     - SCHEMA_REGISTRY_KAFKASTORE_TIMEOUT_MS=10000
     - SCHEMA_REGISTRY_KAFKASTORE_TOPIC_REPLICATION_FACTOR=1
@@ -171,6 +186,8 @@ services:
       interval: 1s
       start_period: 120s
     platform: linux/amd64
+    container_name: schema-registry
+    hostname: schema-registry
   mysql:
     init: true
     ports:
@@ -201,6 +218,10 @@ services:
     - mydata:/var/lib/mysql-files
     image: mysql:9.5.0
     platform: linux/amd64
+    container_name: mysql
+    hostname: mysql
+    networks:
+    - antithesis-net
   mysql-replica:
     init: true
     ports:
@@ -235,6 +256,10 @@ services:
     - mydata:/var/lib/mysql-files
     image: mysql:9.5.0
     platform: linux/amd64
+    container_name: mysql-replica
+    hostname: mysql-replica
+    networks:
+    - antithesis-net
   clusterd1:
     entrypoint:
     - tini
@@ -273,6 +298,10 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    container_name: clusterd1
+    hostname: clusterd1
+    networks:
+    - antithesis-net
   clusterd2:
     entrypoint:
     - tini
@@ -311,6 +340,10 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    container_name: clusterd2
+    hostname: clusterd2
+    networks:
+    - antithesis-net
   clusterd-pool-0:
     entrypoint:
     - tini
@@ -349,6 +382,10 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    container_name: clusterd-pool-0
+    hostname: clusterd-pool-0
+    networks:
+    - antithesis-net
   clusterd-pool-1:
     entrypoint:
     - tini
@@ -387,6 +424,10 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    container_name: clusterd-pool-1
+    hostname: clusterd-pool-1
+    networks:
+    - antithesis-net
   materialized:
     hostname: materialized
     depends_on:
@@ -575,6 +616,9 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    container_name: materialized
+    networks:
+    - antithesis-net
   workload:
     depends_on:
       materialized:
@@ -608,7 +652,13 @@ services:
     - MYSQL_PASSWORD=p@ssw0rd
     platform: linux/amd64
     image: ${ANTITHESIS_WORKLOAD_IMAGE}
-networks: {}
+    container_name: workload
+    hostname: workload
+    networks:
+    - antithesis-net
+networks:
+  antithesis-net:
+    driver: bridge
 volumes:
   mzdata: null
   pgdata: null
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index a204a76fdbf87..f7155ba31c51d 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -189,6 +189,66 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None:
         svc.pop(key, None)
 
 
+# Single user-defined bridge network every service joins. Defining the
+# network explicitly (rather than relying on docker-compose's auto-
+# generated `default`) gives us deterministic container-DNS regardless
+# of how the Antithesis platform's surrounding orchestration parses the
+# compose file. Antithesis support flagged the auto-network as a likely
+# cause of a kafka -> zookeeper UnknownHostException during setup; the
+# fix is to make the network explicit.
+#
+# Must NOT set `internal: true` per Antithesis docker best practices —
+# that would cut us off from the Antithesis-side network used for
+# instrumentation. Plain bridge is the recommended shape.
+ANTITHESIS_NETWORK = "antithesis-net"
+
+
+def assign_network(svc: dict[str, Any]) -> None:
+    """Place the service on the single named bridge network so docker-DNS
+    is deterministic. Overwrites any pre-existing `networks` entry — some
+    upstream Service classes set a vestigial `default: aliases: []` block
+    that we don't want carried through.
+    """
+    svc["networks"] = [ANTITHESIS_NETWORK]
+
+
+def declare_top_level_network(compose: dict[str, Any]) -> None:
+    """Declare the bridge network at the compose top level. Overwrites any
+    pre-existing top-level `networks:` entry (mzcompose currently emits
+    an empty dict).
+    """
+    compose["networks"] = {
+        ANTITHESIS_NETWORK: {"driver": "bridge"},
+    }
+
+
+def set_explicit_names(name: str, svc: dict[str, Any]) -> None:
+    """Set `container_name` and `hostname` to the service key.
+
+    Per Antithesis docker best practices (https://antithesis.com/docs/
+    best_practices/docker_best_practices/), every service should declare
+    its container_name and hostname explicitly and use the same value
+    for both. Triage reports attribute log lines and assertions by
+    `hostname`; if it isn't set, Antithesis falls back to an inferred
+    value (possibly the container id) that's harder to recognize.
+
+    Set here at export time rather than per-service in mzcompose.py so
+    that local mzcompose runs aren't constrained to one global
+    container_name namespace.
+
+    Raises ValueError if the service key contains an underscore, which
+    RFC-1123 hostnames disallow (only underscores are checked). Docker
+    Compose itself accepts such keys, so we validate rather than rewrite.
+    """
+    if "_" in name:
+        raise ValueError(
+            f"service {name!r}: underscores in hostnames break DNS resolution "
+            f"under Antithesis (RFC-1123). Rename the service to use hyphens."
+        )
+    svc["container_name"] = name
+    svc["hostname"] = name
+
+
 def register_referenced_named_volumes(compose: dict[str, Any]) -> None:
     """Declare any named volume referenced by a service that isn't already
     declared at the top level. Docker Compose rejects the file otherwise.
@@ -223,7 +283,7 @@ def main() -> None:
     repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True)
     c = Composition(repo, "antithesis", munge_services=False)
 
-    for svc in c.compose["services"].values():
+    for name, svc in c.compose["services"].items():
         svc["platform"] = "linux/amd64"
         if "mzbuild" in svc:
             resolve_mzbuild(svc)
@@ -231,7 +291,10 @@ def main() -> None:
         strip_host_bindmounts(svc)
         strip_incompatible_env(svc)
         strip_mzcompose_keys(svc)
+        set_explicit_names(name, svc)
+        assign_network(svc)
 
+    declare_top_level_network(c.compose)
     register_referenced_named_volumes(c.compose)
 
     sys.stdout.write(HEADER)

From 7d5aa56b5918ef9a1fda9ccfce1f50acb131ec4b Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 16:40:03 -0400
Subject: [PATCH 54/65] test/antithesis: gate service_started depends_on on
 healthcheck when available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Data-driven export-compose transform: for every depends_on entry that
uses `condition: service_started` against a dependency that declares a
`healthcheck`, upgrade the condition to `service_healthy`. Dependencies
without a healthcheck (currently only clusterd) are left as
`service_started` since there's nothing to wait on.

Under the Antithesis platform, `service_started` proved unreliable as a
readiness gate during initial container startup. Docker fires it as
soon as the dependency's container process starts, before the
dependency's DNS entry is reliably resolvable. The previous run on the
fault-isolated topology saw kafka hit
`java.net.UnknownHostException: zookeeper: Name or service not known`
148+ times in a row before its retry loop landed on a successful
lookup, with the same cascade downstream (schema-registry ↔ kafka).
Both containers exited with code 1 from those retries, tripping the
"No unexpected container exits" property.

Upgraded edges:
  kafka            -> zookeeper       (zookeeper:2181 nc healthcheck)
  schema-registry  -> kafka           (kafka:9092 nc healthcheck)
  materialized     -> minio           (minio /minio/health/live curl)
  workload         -> schema-registry (schema-registry curl healthcheck)

Left alone:
  workload -> clusterd{1,2}           (no clusterd healthcheck)

Gating on the healthcheck (which probes the actual listen port)
eliminates the DNS-race shape because docker won't fire
`service_healthy` until the dependency is answering on its port —
and DNS is reliably resolvable by then.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml |  8 ++---
 test/antithesis/export-compose.py          | 37 ++++++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index a7033da124574..8201cb9a11e7f 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -141,7 +141,7 @@ services:
     - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
     depends_on:
       zookeeper:
-        condition: service_started
+        condition: service_healthy
     healthcheck:
       test:
       - CMD
@@ -175,7 +175,7 @@ services:
       && exec /etc/confluent/docker/launch
     depends_on:
       kafka:
-        condition: service_started
+        condition: service_healthy
     healthcheck:
       test:
       - CMD
@@ -432,7 +432,7 @@ services:
     hostname: materialized
     depends_on:
       minio:
-        condition: service_started
+        condition: service_healthy
       postgres-metadata:
         condition: service_healthy
     command:
@@ -630,7 +630,7 @@ services:
       kafka:
         condition: service_healthy
       schema-registry:
-        condition: service_started
+        condition: service_healthy
       mysql:
         condition: service_healthy
       mysql-replica:
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index f7155ba31c51d..b1921f10474be 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -249,6 +249,42 @@ def set_explicit_names(name: str, svc: dict[str, Any]) -> None:
     svc["hostname"] = name
 
 
+def upgrade_started_to_healthy(compose: dict[str, Any]) -> None:
+    """For every `depends_on` entry that uses `condition: service_started`
+    against a dependency that declares a `healthcheck`, upgrade the
+    condition to `service_healthy`.
+
+    Under the Antithesis platform, `service_started` proved unreliable as
+    a readiness gate during initial container startup: docker fires it as
+    soon as the dependency's container *process* starts, before the
+    dependency's DNS entry is reliably resolvable. The first run on the
+    fault-isolated topology saw kafka hit `UnknownHostException: zookeeper`
+    148+ times in a row before its retry loop landed on a successful
+    lookup, with the same cascade downstream (schema-registry ↔ kafka).
+    Gating on the healthcheck (which probes the actual listen port)
+    eliminates that race.
+
+    Dependencies without a healthcheck (e.g. clusterd, which has no
+    readiness signal we currently expose) are left as `service_started`
+    — there's nothing to wait on.
+    """
+    services = compose.get("services", {})
+    has_healthcheck = {
+        name for name, svc in services.items() if "healthcheck" in svc
+    }
+    for svc in services.values():
+        deps = svc.get("depends_on")
+        if not isinstance(deps, dict):
+            continue
+        for dep_name, dep_spec in deps.items():
+            if (
+                isinstance(dep_spec, dict)
+                and dep_spec.get("condition") == "service_started"
+                and dep_name in has_healthcheck
+            ):
+                dep_spec["condition"] = "service_healthy"
+
+
 def register_referenced_named_volumes(compose: dict[str, Any]) -> None:
     """Declare any named volume referenced by a service that isn't already
     declared at the top level. Docker Compose rejects the file otherwise.
@@ -295,6 +331,7 @@ def main() -> None:
         assign_network(svc)
 
     declare_top_level_network(c.compose)
+    upgrade_started_to_healthy(c.compose)
     register_referenced_named_volumes(c.compose)
 
     sys.stdout.write(HEADER)

From 54fdf00287af4b099ed6184863cc9e3b4e511e05 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 16:48:00 -0400
Subject: [PATCH 55/65] test/antithesis: route every workload draw through
 Antithesis SDK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Antithesis feedback noted that parallel_driver_parallel_workload pulls
one u64 from the SDK, seeds a stdlib `random.Random`, and then makes
every downstream decision deterministically off that seed — locking the
fuzzer out of all branches in the framework's action/expression
subtree.

Add `AntithesisRandom`, a `random.Random` subclass that overrides
`getrandbits()` and `random()` to draw from the Antithesis SDK on every
call. Plug it into `parallel_driver_parallel_workload` so action
selection, DDL choices, expression shape, sample sizes, and every other
in-framework `self.rng.*` call route through the SDK per draw. Each
worker thread gets its own instance.

Also add `random_float(low, high)` in helper_random — needed by the
follow-up commit that swarms `TOMBSTONE_PROB`/`DROP_PROBABILITY` across
invocations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../antithesis/workload/test/helper_random.py | 95 +++++++++++++++++--
 .../test/parallel_driver_parallel_workload.py | 25 +++--
 2 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/test/antithesis/workload/test/helper_random.py b/test/antithesis/workload/test/helper_random.py
index cb749227d6f17..4900778f8b6ab 100644
--- a/test/antithesis/workload/test/helper_random.py
+++ b/test/antithesis/workload/test/helper_random.py
@@ -7,11 +7,26 @@
 # the Business Source License, use of this software will be governed
 # by the Apache License, Version 2.0.
 
-"""Deterministic randomness for Antithesis drivers.
-
-All driver randomness must go through the Antithesis SDK so timelines replay
-deterministically. Outside Antithesis we fall back to the stdlib `random` with a
-fixed-but-arbitrary seed per process so local runs are not flaky.
+"""Antithesis randomness primitives for drivers.
+
+Two layers:
+
+  * Free functions (`random_u64`, `random_int`, `random_bool`, `random_choice`,
+    `random_float`) for direct use in driver code. Each call draws fresh
+    entropy from the Antithesis SDK so different timelines see different
+    values at the same call site — that's how the fuzzer drives coverage.
+
+  * `AntithesisRandom`, a `random.Random` subclass that routes every
+    `getrandbits()` and `random()` call through the SDK. Use it when
+    handing an rng to code that expects a `random.Random` (notably
+    `materialize.parallel_workload`'s `Worker`/`Action`). Seeding a stdlib
+    `random.Random` from a single SDK draw and then making every
+    subsequent decision deterministic locks the fuzzer out of every
+    branch in that subtree; this class avoids that.
+
+Outside Antithesis (e.g. snouty local validate) the SDK is unavailable;
+the helpers and the subclass fall back to a stdlib `Random` seeded from
+`os.urandom` so local runs are non-deterministic but functional.
 """
 
 from __future__ import annotations
@@ -19,7 +34,7 @@
 import os
 import random as _stdlib_random
 from collections.abc import Sequence
-from typing import TypeVar
+from typing import Any, TypeVar
 
 try:
     from antithesis import random as _ar
@@ -30,8 +45,8 @@
 
 T = TypeVar("T")
 
-# A stable per-process seed so local snouty validate runs are deterministic
-# within one process but pick a different sequence per process invocation.
+# Fallback rng for non-Antithesis runs. Seeded once at import time from
+# the OS entropy pool so each process picks a different sequence.
 _FALLBACK = _stdlib_random.Random(int.from_bytes(os.urandom(8), "little"))
 
 
@@ -60,5 +75,67 @@ def random_int(low: int, high: int) -> int:
 def random_bool(true_prob: float) -> bool:
     if not 0.0 <= true_prob <= 1.0:
         raise ValueError("true_prob out of range")
-    # Use 16 bits of entropy to avoid floating-point quirks under replay.
+    # 16 bits of entropy avoids floating-point quirks under replay.
     return (random_u64() & 0xFFFF) < int(true_prob * 0x10000)
+
+
+def random_float(low: float, high: float) -> float:
+    """Uniform draw from [low, high). Useful for swarm parameters where
+    each driver invocation should pick its own probability/weight value
+    so different timelines explore different workload mixes."""
+    if low > high:
+        raise ValueError("low > high")
+    # 53 bits is the precision of a Python float's mantissa; matches what
+    # stdlib `random.random()` returns.
+    unit = random_u64() >> 11
+    fraction = unit / (1 << 53)
+    return low + fraction * (high - low)
+
+
+class AntithesisRandom(_stdlib_random.Random):
+    """A `random.Random` whose every draw comes from the Antithesis SDK.
+
+    The CPython `random.Random` API routes `choice`, `randint`,
+    `randrange`, `sample`, `shuffle`, etc. through `getrandbits()`, and
+    `random()` is its only floating-point primitive. Overriding both
+    here means anything handed an `AntithesisRandom` exercises Antithesis
+    entropy at every decision point, not just once per seed.
+
+    Outside Antithesis we delegate to the module-level `_FALLBACK` so
+    local runs still produce values; instances share that fallback
+    rather than each carrying their own state.
+
+    `seed()` is intentionally a no-op: a Mersenne-Twister-style seed
+    isn't meaningful when entropy is supplied per-draw. `getstate` /
+    `setstate` raise because the SDK's internal state isn't observable
+    or restorable.
+    """
+
+    def random(self) -> float:
+        # Match stdlib `Random.random()` width: top 53 bits of a u64.
+        return (random_u64() >> 11) / (1 << 53)
+
+    def getrandbits(self, k: int) -> int:
+        if k < 0:
+            raise ValueError("number of bits must be non-negative")
+        # Pull ceil(k / 64) chunks, then shift the surplus off the bottom so
+        # the result is in [0, 2**k); k == 0 draws nothing and returns 0.
+        nchunks = (k + 63) // 64
+        bits = 0
+        for _ in range(nchunks):
+            bits = (bits << 64) | random_u64()
+        return bits >> (nchunks * 64 - k)
+
+    def seed(self, *args: Any, **kwargs: Any) -> None:
+        # Entropy comes from the SDK per call; nothing to seed.
+        return None
+
+    def getstate(self) -> Any:
+        raise NotImplementedError(
+            "AntithesisRandom has no snapshottable state; the SDK owns it"
+        )
+
+    def setstate(self, state: Any) -> None:
+        raise NotImplementedError(
+            "AntithesisRandom has no restorable state; the SDK owns it"
+        )
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 4f5302c714544..945fd0805e515 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -40,7 +40,6 @@
 
 import logging
 import os
-import random
 import sys
 import threading
 import time
@@ -426,18 +425,27 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
 
 
 def _spawn_workers(
-    rng: random.Random,
+    rng: helper_random.AntithesisRandom,
     database: Database,
     end_time: float,
     num_threads: int,
 ) -> tuple[list[Worker], list[threading.Thread]]:
     """Build the same thread pool `parallel_workload.run()` does for
-    `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper."""
+    `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper.
+
+    Each worker gets its own `AntithesisRandom` instance so the framework's
+    per-Action `self.rng.choice/randint/random/sample` calls route through
+    Antithesis on every draw. The framework expects a `random.Random`;
+    `AntithesisRandom` is a subclass that overrides the entropy primitives
+    to read from the SDK, so action selection, expression shape, DDL
+    choices, and every other decision are driven by the fuzzer instead of
+    being locked in after one seed.
+    """
     weights = [60, 30, 30, 30, 100]
     workers: list[Worker] = []
     threads: list[threading.Thread] = []
     for i in range(num_threads):
-        worker_rng = random.Random(rng.randrange(1_000_000))
+        worker_rng = helper_random.AntithesisRandom()
         action_list = worker_rng.choices(
             [
                 read_action_list,
@@ -475,7 +483,12 @@ def _spawn_workers(
 
 def main() -> int:
     seed = str(helper_random.random_u64())
-    rng = random.Random(seed)
+    # AntithesisRandom routes every getrandbits/random call through the
+    # Antithesis SDK, so every decision the parallel_workload framework
+    # makes downstream of this rng draws fresh entropy on each call. A
+    # stdlib `random.Random(seed)` would lock the timeline in after one
+    # draw and the fuzzer couldn't drive differing branches.
+    rng = helper_random.AntithesisRandom()
 
     LOG.info(
         "parallel-workload starting: seed=%s threads=%d runtime=%ss",
@@ -513,7 +526,7 @@ def main() -> int:
 
 def _run_invocation(
     seed: str,
-    rng: random.Random,
+    rng: helper_random.AntithesisRandom,
     cluster_name: str,
 ) -> int:
     """The bulk of `main()` once a pool slot has been claimed. Split out

From 63e10740ad74c956b5711adf614ad9ab65ceac42 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 16:52:24 -0400
Subject: [PATCH 56/65] test/antithesis: swarm tombstone / drop probabilities
 per invocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Antithesis feedback called out hardcoded probability constants
(TOMBSTONE_PROB, DROP_PROBABILITY) as missed swarm-testing opportunities
— every timeline ran the exact same workload mix instead of letting the
fuzzer drive the parameter.

Replace the three hardcoded constants with per-invocation draws from
helper_random.random_float() over sensible ranges:

  parallel_driver_upsert_latest_value:
      TOMBSTONE_PROB 0.15  ->  random_float(0.05, 0.50)
  singleton_driver_upsert_state_rehydration:
      TOMBSTONE_PROB 0.20  ->  random_float(0.05, 0.50) (fixed per run
      so cross-cycle stability of `expected` still tests rehydration)
  singleton_driver_catalog_recovery_consistency:
      DROP_PROBABILITY 0.20 ->  random_float(0.10, 0.50)

The draw happens once at the top of main() and is logged for triage.
Each timeline ends up with a different mix; the fuzzer is free to push
toward whichever extreme reveals a bug.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../parallel_driver_upsert_latest_value.py    | 20 +++++++++++--
 ...ton_driver_catalog_recovery_consistency.py | 20 ++++++++++---
 ...ngleton_driver_upsert_state_rehydration.py | 28 +++++++++++++++----
 3 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index fcfabea77620d..68734c7f03c82 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -58,7 +58,14 @@
 PRODUCES_PER_INVOCATION = 40
 DISTINCT_KEYS = 8  # small key space so we re-write the same key often
 DISTINCT_VALUES = 16
-TOMBSTONE_PROB = 0.15
+
+# Tombstone probability is drawn per-invocation in main() from a wide
+# range. Different timelines see different mixes — heavy-tombstone runs
+# stress upsert removal, mostly-live runs stress value-overwrite — and
+# the fuzzer drives which one each timeline gets. A fixed constant would
+# make every invocation identical in this respect and waste fuzzer
+# budget on the same workload shape.
+TOMBSTONE_PROB_RANGE = (0.05, 0.50)
 
 QUIET_PERIOD_S = 20
 CATCHUP_TIMEOUT_S = 60.0
@@ -113,7 +120,14 @@ def main() -> int:
     # Per-invocation prefix isolates this driver's keys from other concurrent
     # drivers and from previous invocations of this same driver.
     prefix = f"p{helper_random.random_u64():016x}"
-    LOG.info("driver starting; prefix=%s", prefix)
+
+    # Swarm: pick this invocation's tombstone fraction from the configured
+    # range. The fuzzer sees this as one of the first decisions in the
+    # timeline and can drive it toward whichever extreme reveals a bug.
+    tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE)
+    LOG.info(
+        "driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob
+    )
 
     producer, tracker = make_producer(client_id=f"antithesis-{prefix}")
 
@@ -131,7 +145,7 @@ def main() -> int:
     keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)]
     for _ in range(PRODUCES_PER_INVOCATION):
         key = helper_random.random_choice(keys)
-        if helper_random.random_bool(TOMBSTONE_PROB):
+        if helper_random.random_bool(tombstone_prob):
             if expected.get(key) is not None:
                 tombstoned_after_value += 1
             _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None)
diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
index 53e791185b4ab..5612a19c30ea8 100755
--- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
+++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
@@ -73,9 +73,13 @@
 # mid-DDL still resolves before the next cycle. CYCLE_COUNT high enough to
 # give Antithesis multiple windows to land a restart between cycles.
 CYCLE_COUNT = 10
-DROP_PROBABILITY = 0.20
 INTER_CYCLE_SLEEP_S = 2.0
 
+# Drop fraction is swarmed per-invocation in main(). Wide range so different
+# timelines exercise create-heavy (catalog grows) and drop-heavy (churn-
+# through-recovery) modes without rebuilding the driver.
+DROP_PROBABILITY_RANGE = (0.10, 0.50)
+
 PROBE_CONNECT_TIMEOUT_S = 2.0
 
 
@@ -134,6 +138,7 @@ def _run_cycle(
     name_prefix: str,
     cycle_idx: int,
     next_id: int,
+    drop_probability: float,
 ) -> tuple[bool, int]:
     """One create-or-drop + verify cycle.
 
@@ -150,7 +155,7 @@ def _run_cycle(
     missing from the post-recovery catalog.
     """
     new_id = next_id
-    if expected and helper_random.random_bool(DROP_PROBABILITY):
+    if expected and helper_random.random_bool(drop_probability):
         # Drop a random existing table. Choosing from `expected` keeps the
         # drop deterministic w.r.t. the local model.
         table = sorted(expected)[helper_random.random_int(0, len(expected) - 1)]
@@ -210,7 +215,12 @@ def main() -> int:
     # Per-timeline namespace so concurrent timelines and any future
     # parallel_driver_ instances do not collide on table names.
     name_prefix = f"catrec_{helper_random.random_u64():016x}"
-    LOG.info("catalog recovery driver starting; name_prefix=%s", name_prefix)
+    drop_probability = helper_random.random_float(*DROP_PROBABILITY_RANGE)
+    LOG.info(
+        "catalog recovery driver starting; name_prefix=%s drop_probability=%.3f",
+        name_prefix,
+        drop_probability,
+    )
 
     expected: set[str] = set()
     next_id = 0
@@ -218,7 +228,9 @@ def main() -> int:
     saw_coord_unavailable = False
 
     for cycle_idx in range(CYCLE_COUNT):
-        ran, next_id = _run_cycle(expected, name_prefix, cycle_idx, next_id)
+        ran, next_id = _run_cycle(
+            expected, name_prefix, cycle_idx, next_id, drop_probability
+        )
         if ran:
             cycles_ran += 1
         if _saw_coord_unavailable():
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 26342d0ed43e8..3c9876ba79988 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -81,7 +81,14 @@
 PRODUCES_PER_CYCLE = 30
 DISTINCT_KEYS = 6
 DISTINCT_VALUES = 12
-TOMBSTONE_PROB = 0.20
+
+# Tombstone fraction is swarmed once per driver invocation (see main()) so
+# different timelines exercise different live/dead mixes — heavy-tombstone
+# runs stress the upsert-state-remove rehydration path, mostly-live runs
+# stress value-overwrite rehydration. The choice is fixed for the whole
+# driver lifetime so cross-cycle stability of `expected` still tests
+# rehydration, not just per-cycle convergence.
+TOMBSTONE_PROB_RANGE = (0.05, 0.50)
 
 QUIET_PERIOD_S = 25
 CATCHUP_TIMEOUT_S = 120.0
@@ -113,7 +120,11 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]:
 
 
 def _run_cycle(
-    producer, tracker, expected: dict[str, str | None], cycle_idx: int
+    producer,
+    tracker,
+    expected: dict[str, str | None],
+    cycle_idx: int,
+    tombstone_prob: float,
 ) -> bool:
     """Produce one batch, settle, and assert state for every tracked key.
 
@@ -122,7 +133,7 @@ def _run_cycle(
     keys = [f"reh-k{i}" for i in range(DISTINCT_KEYS)]
     for _ in range(PRODUCES_PER_CYCLE):
         key = helper_random.random_choice(keys)
-        if helper_random.random_bool(TOMBSTONE_PROB):
+        if helper_random.random_bool(tombstone_prob):
             producer.produce(
                 topic=TOPIC_UPSERT_TEXT,
                 key=key.encode("utf-8"),
@@ -201,7 +212,14 @@ def _run_cycle(
 
 def main() -> int:
     ensure_upsert_text_source()
-    LOG.info("rehydration driver starting; %d cycles planned", CYCLE_COUNT)
+    # Swarm once per invocation, fixed for the run so cross-cycle stability
+    # of `expected` keeps testing rehydration rather than per-cycle drift.
+    tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE)
+    LOG.info(
+        "rehydration driver starting; %d cycles planned tombstone_prob=%.3f",
+        CYCLE_COUNT,
+        tombstone_prob,
+    )
 
     producer, tracker = make_producer(client_id="antithesis-rehydration")
     expected: dict[str, str | None] = {}
@@ -209,7 +227,7 @@ def main() -> int:
     cycles_run = 0
 
     for cycle_idx in range(CYCLE_COUNT):
-        if _run_cycle(producer, tracker, expected, cycle_idx):
+        if _run_cycle(producer, tracker, expected, cycle_idx, tombstone_prob):
             cycles_run += 1
         time.sleep(INTER_CYCLE_SLEEP_S)
 

From 6479ea8f391f05a584dcdae30b3dd0cc4e357b9a Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 16:54:44 -0400
Subject: [PATCH 57/65] test/antithesis: move quiet/active windows to a global
 fault-orchestrator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Antithesis feedback: with every parallel driver requesting its own
`ANTITHESIS_STOP_FAULTS` window, the union of overlapping per-driver
quiet periods leaves the SUT mostly un-faulted. Faults should arrive
on a single coordinated cadence driven from a dedicated container, and
workloads should stay robust to whatever quiet/faulting transitions
the orchestrator picks — the catchup-then-assert pattern already there
fits that model.

Topology change: add a `fault-orchestrator` service backed by `bash:5`
running `test/antithesis/fault-orchestrator/pause_faults.sh`, adapted
from the Antithesis hands-on tutorial. It alternates
faults-OFF/faults-ON windows at randomised intervals
(START_DELAY=30, MIN_ON/MAX_ON/MIN_OFF/MAX_OFF=20-40), centralising the
cadence. Outside Antithesis (`ANTITHESIS_STOP_FAULTS` unset) the script
no-ops so snouty local validate still works.

The script is loaded via `Path(...).read_text()` inside
`FaultOrchestrator(Service)`; every `$` is doubled to `$$` before
embedding into the compose YAML so docker-compose's parse-time
variable interpolation doesn't eat shell references like `${RANDOM}`
or `${ANTITHESIS_STOP_FAULTS}`. The on-disk .sh file stays plain bash
so shellcheck and direct execution still work.

Driver-side: delete `helper_quiet.py` and every `request_quiet_period`
call site (9 drivers). Each driver's `wait_for_catchup` timeout (or
the equivalent FINAL_READ_TIMEOUT_S in the strict-serializable driver)
is bumped to span at least one MAX_OFF window plus catchup overhead
— concretely 90s for the short Kafka/MV/upsert drivers, 120s for
MySQL CDC paths, and 180s for the singleton rehydration driver which
must survive a clusterd kill landing inside a quiet window. Liveness
`sometimes(...)` anchor messages were renamed
"after quiet period" → "within catchup budget" to match the new
semantics; scratchbook docs that quoted the exact strings are updated
to match.

Regenerate test/antithesis/config/docker-compose.yaml via
export-compose.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml    | 64 ++++++++++++++++
 .../fault-orchestrator/pause_faults.sh        | 76 +++++++++++++++++++
 test/antithesis/mzcompose.py                  | 75 ++++++++++++++++++
 .../properties/kafka-source-no-data-loss.md   |  2 +-
 .../mysql-myisam-cdc-no-data-loss.md          |  2 +-
 .../properties/mysql-source-no-data-loss.md   |  2 +-
 .../upsert-key-reflects-latest-value.md       |  4 +-
 .../scratchbook/property-catalog.md           |  4 +-
 test/antithesis/workload/test/helper_quiet.py | 38 ----------
 .../parallel_driver_kafka_none_envelope.py    | 18 +++--
 ...rallel_driver_mv_reflects_table_updates.py | 20 ++---
 .../test/parallel_driver_mysql_cdc.py         | 18 ++---
 .../test/parallel_driver_mysql_myisam.py      | 17 ++---
 .../test/parallel_driver_parallel_workload.py | 20 +++--
 ...rallel_driver_strict_serializable_reads.py | 18 +++--
 ...llel_driver_upsert_ancient_key_writable.py | 13 ++--
 .../parallel_driver_upsert_latest_value.py    | 32 ++++----
 ...ton_driver_catalog_recovery_consistency.py |  3 +-
 ...ngleton_driver_upsert_state_rehydration.py | 39 +++++-----
 19 files changed, 332 insertions(+), 133 deletions(-)
 create mode 100755 test/antithesis/fault-orchestrator/pause_faults.sh
 delete mode 100644 test/antithesis/workload/test/helper_quiet.py

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 8201cb9a11e7f..ecd6eea161e07 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -619,6 +619,70 @@ services:
     container_name: materialized
     networks:
     - antithesis-net
+  fault-orchestrator:
+    image: bash:5
+    entrypoint:
+    - bash
+    - -c
+    command:
+    - "#!/usr/bin/env bash\n\n# Copyright Materialize, Inc. and contributors. All\
+      \ rights reserved.\n#\n# Use of this software is governed by the Business Source\
+      \ License\n# included in the LICENSE file at the root of this repository.\n\
+      #\n# As of the Change Date specified in that file, in accordance with\n# the\
+      \ Business Source License, use of this software will be governed\n# by the Apache\
+      \ License, Version 2.0.\n\n# Drive Antithesis fault windows globally.\n#\n#\
+      \ Antithesis injects faults into the system continuously by default.\n# Calling\
+      \ `ANTITHESIS_STOP_FAULTS <seconds>` requests a quiet window \u2014\n# Antithesis\
+      \ pauses fault injection for that many seconds. The Antithesis\n# engagement\
+      \ team's recommendation: drive these quiet windows from a\n# single dedicated\
+      \ container, not per-driver, otherwise overlapping\n# per-driver requests keep\
+      \ the system in a quiet state most of the time\n# and we never actually fault.\n\
+      #\n# This script alternates faults-OFF (quiet) and faults-ON (active)\n# windows\
+      \ at randomized intervals so each timeline sees a different\n# cadence. Adapted\
+      \ from the Antithesis hands-on tutorial:\n#   https://github.com/antithesishq/hands-on-tutorial-1/blob/main/python/antithesis/pause_faults.sh\n\
+      #\n# Outside Antithesis (snouty local validate) `ANTITHESIS_STOP_FAULTS` is\n\
+      # unset; the script exits immediately so the rest of the compose works.\n\n\
+      set -euo pipefail\n\nif [[ -z \"$${ANTITHESIS_STOP_FAULTS:-}\" ]]; then\n  \
+      \  echo \"ANTITHESIS_STOP_FAULTS not set; fault-orchestrator exiting (no-op)\"\
+      \n    exit 0\nfi\n\n# Tunable via the service `environment:` block. Defaults\
+      \ sized so that:\n#   * MAX_ON is comfortably shorter than any driver's CATCHUP_TIMEOUT_S\n\
+      #     (smallest is 60s in parallel_driver_upsert_latest_value) \u2014 a\n# \
+      \    driver's catchup window can always span at least one full quiet\n#    \
+      \ period.\n#   * MIN_OFF is long enough for materialized to commit a few timestamps\n\
+      #     and for sources to advance offset_committed past the most recent\n#  \
+      \   batch of produced offsets.\n#   * START_DELAY gives setup-complete + bootstrap\
+      \ a window of un-faulted\n#     time before the alternation begins.\nSTART_DELAY=\"\
+      $${START_DELAY:-30}\"\nMIN_ON=\"$${MIN_ON:-20}\"\nMAX_ON=\"$${MAX_ON:-40}\"\n\
+      MIN_OFF=\"$${MIN_OFF:-20}\"\nMAX_OFF=\"$${MAX_OFF:-40}\"\n\necho \"fault-orchestrator:\
+      \ ON $${MIN_ON}-$${MAX_ON}s / OFF $${MIN_OFF}-$${MAX_OFF}s, initial pause $${START_DELAY}s\"\
+      \n\n# Initial quiet window so the rest of the stack reaches steady state\n#\
+      \ before Antithesis starts faulting. Antithesis may or may not honour\n# this\
+      \ depending on when fault injection begins relative to setup-\n# complete; either\
+      \ way the local sleep gives drivers a clean start.\n\"$${ANTITHESIS_STOP_FAULTS}\"\
+      \ \"$${START_DELAY}\"\nsleep \"$${START_DELAY}\"\n\nwhile true; do\n    # Re-seed\
+      \ $$RANDOM from /dev/urandom so successive iterations don't\n    # repeat the\
+      \ same on/off period (the shell's RANDOM is a 16-bit LCG;\n    # without reseeding\
+      \ it can produce predictable sequences).\n    RANDOM=$$(od -An -N2 -tu2 /dev/urandom\
+      \ | tr -d ' ')\n    ON_PERIOD=$$((MIN_ON + (RANDOM % (MAX_ON - MIN_ON + 1))))\n\
+      \    OFF_PERIOD=$$((MIN_OFF + (RANDOM % (MAX_OFF - MIN_OFF + 1))))\n\n    echo\
+      \ \"fault-orchestrator: faults OFF for $${OFF_PERIOD}s\"\n    \"$${ANTITHESIS_STOP_FAULTS}\"\
+      \ \"$${OFF_PERIOD}\"\n    sleep \"$${OFF_PERIOD}\"\n\n    echo \"fault-orchestrator:\
+      \ faults ON for $${ON_PERIOD}s\"\n    sleep \"$${ON_PERIOD}\"\ndone\n"
+    environment:
+    - START_DELAY=30
+    - MIN_ON=20
+    - MAX_ON=40
+    - MIN_OFF=20
+    - MAX_OFF=40
+    depends_on:
+      materialized:
+        condition: service_healthy
+    restart: 'no'
+    platform: linux/amd64
+    container_name: fault-orchestrator
+    hostname: fault-orchestrator
+    networks:
+    - antithesis-net
   workload:
     depends_on:
       materialized:
diff --git a/test/antithesis/fault-orchestrator/pause_faults.sh b/test/antithesis/fault-orchestrator/pause_faults.sh
new file mode 100755
index 0000000000000..00cb4e910bc47
--- /dev/null
+++ b/test/antithesis/fault-orchestrator/pause_faults.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Drive Antithesis fault windows globally.
+#
+# Antithesis injects faults into the system continuously by default.
+# Calling `ANTITHESIS_STOP_FAULTS <seconds>` requests a quiet window —
+# Antithesis pauses fault injection for that many seconds. The Antithesis
+# engagement team's recommendation: drive these quiet windows from a
+# single dedicated container, not per-driver, otherwise overlapping
+# per-driver requests keep the system in a quiet state most of the time
+# and we never actually fault.
+#
+# This script alternates faults-OFF (quiet) and faults-ON (active)
+# windows at randomized intervals so each timeline sees a different
+# cadence. Adapted from the Antithesis hands-on tutorial:
+#   https://github.com/antithesishq/hands-on-tutorial-1/blob/main/python/antithesis/pause_faults.sh
+#
+# Outside Antithesis (snouty local validate) `ANTITHESIS_STOP_FAULTS` is
+# unset; the script exits immediately so the rest of the compose works.
+
+set -euo pipefail
+
+if [[ -z "${ANTITHESIS_STOP_FAULTS:-}" ]]; then
+    echo "ANTITHESIS_STOP_FAULTS not set; fault-orchestrator exiting (no-op)"
+    exit 0
+fi
+
+# Tunable via the service `environment:` block. Defaults sized so that:
+#   * MAX_ON is comfortably shorter than any driver's CATCHUP_TIMEOUT_S
+#     (smallest is 90s, e.g. parallel_driver_upsert_latest_value) — a
+#     driver's catchup window can always span at least one full quiet
+#     period.
+#   * MIN_OFF is long enough for materialized to commit a few timestamps
+#     and for sources to advance offset_committed past the most recent
+#     batch of produced offsets.
+#   * START_DELAY gives setup-complete + bootstrap a window of un-faulted
+#     time before the alternation begins.
+START_DELAY="${START_DELAY:-30}"
+MIN_ON="${MIN_ON:-20}"
+MAX_ON="${MAX_ON:-40}"
+MIN_OFF="${MIN_OFF:-20}"
+MAX_OFF="${MAX_OFF:-40}"
+
+echo "fault-orchestrator: ON ${MIN_ON}-${MAX_ON}s / OFF ${MIN_OFF}-${MAX_OFF}s, initial pause ${START_DELAY}s"
+
+# Initial quiet window so the rest of the stack reaches steady state
+# before Antithesis starts faulting. Antithesis may or may not honour
+# this depending on when fault injection begins relative to setup-
+# complete; either way the local sleep gives drivers a clean start.
+"${ANTITHESIS_STOP_FAULTS}" "${START_DELAY}"
+sleep "${START_DELAY}"
+
+while true; do
+    # Re-seed $RANDOM from two bytes of /dev/urandom on every iteration so
+    # the on/off periods do not follow the shell's built-in sequence
+    # ($RANDOM is a deterministic PRNG that yields values in 0..32767).
+    RANDOM=$(od -An -N2 -tu2 /dev/urandom | tr -d ' ')
+    ON_PERIOD=$((MIN_ON + (RANDOM % (MAX_ON - MIN_ON + 1))))
+    OFF_PERIOD=$((MIN_OFF + (RANDOM % (MAX_OFF - MIN_OFF + 1))))
+
+    echo "fault-orchestrator: faults OFF for ${OFF_PERIOD}s"
+    "${ANTITHESIS_STOP_FAULTS}" "${OFF_PERIOD}"
+    sleep "${OFF_PERIOD}"
+
+    echo "fault-orchestrator: faults ON for ${ON_PERIOD}s"
+    sleep "${ON_PERIOD}"
+done
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index bfbc1abd6d4ca..0f333afeee8e4 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -32,6 +32,13 @@
                         from the harness; defaults to 8).
   - materialized      : the SUT (environmentd; clusterd is external)
   - workload          : Python test driver wired to the Antithesis SDK
+  - fault-orchestrator : single bash container alternating quiet and
+                        faulting windows globally via
+                        `ANTITHESIS_STOP_FAULTS`. Centralising the
+                        cadence avoids the failure mode where every
+                        driver requests its own quiet window and the
+                        union of overlapping requests keeps the system
+                        in a quiet state most of the time.
 
 Usage:
   bin/mzcompose --find antithesis run default                       # bring up the cluster
@@ -39,6 +46,7 @@
 """
 
 import os
+from pathlib import Path
 
 from materialize.mzcompose.composition import Composition
 from materialize.mzcompose.service import Service, ServiceConfig
@@ -75,6 +83,71 @@
 CLUSTERD_WORKERS = 16
 
 
+class FaultOrchestrator(Service):
+    """Single bash container that drives Antithesis fault windows globally.
+
+    Invokes `${ANTITHESIS_STOP_FAULTS} <seconds>` to open quiet windows,
+    then sleeps through faults-ON windows, on a randomised cadence
+    (MIN_ON..MAX_ON / MIN_OFF..MAX_OFF). The script is bundled in
+    `test/antithesis/fault-orchestrator/pause_faults.sh` and inlined into
+    the compose `command:` here so we don't need a new mzbuild image
+    just to ship one short bash script.
+
+    The Antithesis engagement team flagged per-driver quiet-period
+    requests as an anti-pattern: with many concurrent drivers each
+    asking for a quiet window, the union of overlapping windows leaves
+    the SUT mostly un-faulted. Centralising the cadence here means
+    faults arrive in one coordinated rhythm; drivers stay robust to
+    quiet/faulting transitions by relying on `wait_for_catchup` with
+    generous timeouts.
+
+    Outside Antithesis `ANTITHESIS_STOP_FAULTS` is unset and the script
+    exits immediately, so this service is a no-op for local validate.
+    """
+
+    def __init__(self) -> None:
+        script_path = Path(__file__).parent / "fault-orchestrator" / "pause_faults.sh"
+        # Compose interpolates `${VAR}` in every string value at parse
+        # time, which would eat the script's shell variable references
+        # (`${RANDOM}`, `${MIN_ON}`, `${ANTITHESIS_STOP_FAULTS}`, etc.)
+        # before bash ever sees them. Double the `$` to pass through a
+        # literal `$` and let bash do its own expansion at runtime. The
+        # underlying .sh file stays normal so shellcheck and direct
+        # execution work.
+        script = script_path.read_text(encoding="utf-8").replace("$", "$$")
+        config: ServiceConfig = {
+            # bash:5 is alpine-based and ships `bash`, `od`, `tr`, and
+            # `sleep` via busybox — everything the script uses. Public
+            # image, so it sails through export-compose.py untouched.
+            "image": "bash:5",
+            # `bash -c` executes the script text passed as the next argument.
+            # (`bash -s` would read commands from stdin and never run it.)
+            "entrypoint": ["bash", "-c"],
+            "command": [script],
+            "environment": [
+                # Defaults chosen so MAX_ON stays well under the smallest
+                # driver's CATCHUP_TIMEOUT_S (currently 90s) — every
+                # driver lifetime has a chance to span at least one quiet
+                # window.
+                "START_DELAY=30",
+                "MIN_ON=20",
+                "MAX_ON=40",
+                "MIN_OFF=20",
+                "MAX_OFF=40",
+            ],
+            # Wait for materialized so the orchestrator's first
+            # ANTITHESIS_STOP_FAULTS call doesn't precede the SUT being
+            # ready. Timing is not safety-critical: Antithesis only
+            # starts injecting faults after setup-complete fires from
+            # the workload container.
+            "depends_on": {
+                "materialized": {"condition": "service_healthy"},
+            },
+            "restart": "no",
+        }
+        super().__init__(name="fault-orchestrator", config=config)
+
+
 class Workload(Service):
     """Antithesis workload client — Python test driver."""
 
@@ -233,6 +306,7 @@ def __init__(self) -> None:
             "unsafe_enable_unorchestrated_cluster_replicas": "true",
         },
     ),
+    FaultOrchestrator(),
     Workload(),
 ]
 
@@ -253,4 +327,5 @@ def workflow_default(c: Composition) -> None:
         "mysql-replica",
     )
     c.up("materialized")
+    c.up("fault-orchestrator")
     c.up("workload")
diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
index e999c42b76083..af21f23e6665b 100644
--- a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
+++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md
@@ -43,7 +43,7 @@ Implemented 2026-05-11 (NONE envelope, workload-side) as `test/antithesis/worklo
 
 | Message | Type | Fires when |
 |---------|------|------------|
-| `"kafka source caught up to produced offsets after quiet period (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor |
+| `"kafka source caught up to produced offsets within catchup budget (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor |
 | `"kafka source: every produced payload is visible exactly once"` | `always` | Per produced payload, after catchup; carries `payload`, `present`, `observed_count` in details |
 
 The UPSERT-envelope arm of this property is covered by `upsert-key-reflects-latest-value`.
diff --git a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
index 5e5b6fd239f0e..c84150c26cde9 100644
--- a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
+++ b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md
@@ -72,7 +72,7 @@ Implemented as `test/antithesis/workload/test/parallel_driver_mysql_myisam.py`.
 |---------|------|------------|
 | `"mysql myisam: CDC source row has correct value after catchup"` | `always` | Per row, after catchup. False ⟺ row missing or value wrong. |
 | `"mysql myisam: CDC source row count matches inserted count after catchup"` | `always` | Per invocation, after catchup. False ⟺ extra or missing rows for this batch. |
-| `"mysql myisam: CDC source caught up to all primary inserts after quiet period"` | `sometimes` | Per invocation. Liveness for the catchup gate. |
+| `"mysql myisam: CDC source caught up to all primary inserts within catchup budget"` | `sometimes` | Per invocation. Liveness for the catchup gate. |
 | `"mysql replica: both cdc_test tables replicated from primary within 90s"` | `sometimes` | Per timeline (fires once from `first_mysql_replica_setup`). Confirms replication is flowing for both engines. |
 
 Knobs: `ROWS_PER_INVOCATION=20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=90.0`.
diff --git a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
index 19f6d02d68974..dd707894f61bf 100644
--- a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
+++ b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md
@@ -26,7 +26,7 @@ Each `parallel_driver_` invocation:
 3. Requests an Antithesis quiet period (25 s) and polls `antithesis_cdc` in
    Materialize until all expected rows appear or the 90 s budget expires.
 4. Fires:
-   - `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)`
+   - `sometimes("mysql: CDC source caught up to all primary inserts within catchup budget", …)`
      — liveness anchor; confirms at least one invocation reaches full catchup.
    - `always("mysql: CDC source row has correct value after catchup", …)` — safety;
      fired once per row, catches wrong-value corruption.
diff --git a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
index 90341358df926..d504357e685ee 100644
--- a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
+++ b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md
@@ -52,9 +52,9 @@ Implemented 2026-05-11 as `test/antithesis/workload/test/parallel_driver_upsert_
 |---------|------|------|
 | `"upsert: SELECT for key matches latest produced value"` | `always` | Per sampled live key after quiet-period catchup |
 | `"upsert: tombstoned key has no row in source"` | `always` | Per sampled key whose last produced message was a tombstone |
-| `"upsert: source caught up to produced offsets after quiet period"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data |
+| `"upsert: source caught up to produced offsets within catchup budget"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data |
 
-Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_quiet.py` (`ANTITHESIS_STOP_FAULTS` wrapper), `helper_random.py` (deterministic randomness with Antithesis SDK), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`).
+Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_random.py` (Antithesis SDK randomness, including an `AntithesisRandom` subclass for code expecting a `random.Random`), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`). Quiet windows are driven globally by a `fault-orchestrator` service (alternating randomized faults-ON / faults-OFF intervals); drivers no longer call `ANTITHESIS_STOP_FAULTS` themselves and rely on `wait_for_catchup` with a budget sized to span one quiet window.
 
 No SUT-side instrumentation added in this pass — that is the candidate work in `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, and `properties/upsert-ensure-decoded-called-before-access.md`.
 
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index ec139c3ea4ae9..067a63f755e8c 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -294,7 +294,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader`
 |---|---|
 | **Type** | Safety |
 | **Priority** | P0 — the entire user-visible promise of the UPSERT envelope |
-| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets after quiet period"). |
+| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets within catchup budget"). |
 | **Property** | At a settled timestamp, for each key produced by the workload, the UPSERT source contains exactly the value from the last `(key, value)` message produced — or no row if the last message for that key was a tombstone. |
 | **Invariant** | `Always`: for every workload-tracked key, `SELECT value FROM source WHERE key = ?` returns the expected value (or empty for tombstoned keys), as determined by the workload's local model of what it produced. Checked after `ANTITHESIS_STOP_FAULTS` quiet periods. |
 | **Antithesis Angle** | Reorder produce timing, kill clusterd between the prior-value lookup (`multi_get`) and the new-value write (`multi_put`), inject delays in the feedback-driven snapshot phase. Tests order-key monotonicity (commit f177db8286), state-backend consistency, and snapshot-completion correctness. |
@@ -437,7 +437,7 @@ commit-order preservation) to the Antithesis environment.
 |---|---|
 | **Type** | Liveness + Safety |
 | **Priority** | P1 — end-to-end correctness of the MySQL CDC pipeline; tests a distinct code path from Kafka |
-| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, waits for a quiet period, then polls `antithesis_cdc` until all rows appear (or 90 s budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to the MySQL primary, then polls `antithesis_cdc` until all rows appear (or the `CATCHUP_TIMEOUT_S` budget, now 120 s, expires); the driver no longer requests a quiet period itself and relies on the quiet windows driven by the global `fault-orchestrator` service. `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts within catchup budget", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. |
 | **Property** | After inserting a row to the MySQL primary (via the binlog + GTID-based multithreaded replica), the Materialize CDC source eventually contains that row with the correct value. |
 | **Invariant** | `Always`: after catchup, for every row inserted to `antithesis.cdc_test` on the primary, `SELECT value FROM antithesis_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the quiet-period budget at least once per run. |
 | **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. |
diff --git a/test/antithesis/workload/test/helper_quiet.py b/test/antithesis/workload/test/helper_quiet.py
deleted file mode 100644
index adb4f9ead3e6d..0000000000000
--- a/test/antithesis/workload/test/helper_quiet.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright Materialize, Inc. and contributors. All rights reserved.
-#
-# Use of this software is governed by the Business Source License
-# included in the LICENSE file at the root of this repository.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0.
-
-"""Wrapper around the Antithesis ANTITHESIS_STOP_FAULTS binary.
-
-Outside Antithesis (e.g. snouty local validate), the env var is unset and this
-becomes a no-op so the workload still runs end-to-end.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-import subprocess
-
-LOG = logging.getLogger("antithesis.helper_quiet")
-
-
-def request_quiet_period(seconds: int) -> bool:
-    """Request that Antithesis pause all faults for `seconds`.
-
-    Returns True if the request was issued, False if not in Antithesis. Either
-    way callers must still poll for the system to stabilize — the binary
-    returns immediately and the actual quiet window unfolds asynchronously.
-    """
-    binary = os.environ.get("ANTITHESIS_STOP_FAULTS")
-    if not binary:
-        LOG.info("ANTITHESIS_STOP_FAULTS not set; skipping quiet-period request")
-        return False
-    LOG.info("requesting %ds quiet period via %s", seconds, binary)
-    subprocess.run([binary, str(seconds)], check=False)
-    return True
diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
index bbb4e2529eca8..656a6b6b6d776 100755
--- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -24,8 +24,10 @@
      workload can filter the source down to its own rows when asserting.
   3. Produces N distinct payloads, recording the broker-assigned `(partition,
      offset)` for each via the delivery callback.
-  4. Requests an Antithesis quiet period and waits for `offset_committed`
-     to reach the highest produced offset.
+  4. Waits for `offset_committed` to reach the highest produced offset.
+     The global fault-orchestrator service drives quiet/active windows
+     on its own cadence; the catchup timeout is sized to span at least
+     one quiet window so the source can advance during it.
   5. Runs two `assert_always` checks:
        - "kafka source: no duplicate (partition, offset)" — `GROUP BY 1, 2 HAVING COUNT(*) > 1` is empty
        - "kafka source: every produced payload is visible exactly once" —
@@ -45,7 +47,6 @@
 import sys
 
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_none_source import (
     SOURCE_NONE_TEXT,
@@ -53,9 +54,10 @@
     ensure_none_text_source,
 )
 from helper_pg import query_retry
-from helper_quiet import request_quiet_period
 from helper_source_stats import wait_for_catchup
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -65,8 +67,9 @@
 # — Antithesis launches the driver many times and accumulates coverage
 # across invocations, not within one giant batch.
 PRODUCES_PER_INVOCATION = 50
-QUIET_PERIOD_S = 20
-CATCHUP_TIMEOUT_S = 60.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus enough buffer for catchup itself.
+CATCHUP_TIMEOUT_S = 90.0
 
 
 def main() -> int:
@@ -114,14 +117,13 @@ def main() -> int:
     # source query below joins payloads back to (partition, offset)
     # assignments without us needing to track them at produce time.
 
-    request_quiet_period(QUIET_PERIOD_S)
     caught_up = wait_for_catchup(
         SOURCE_NONE_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
     )
 
     sometimes(
         caught_up,
-        "kafka source caught up to produced offsets after quiet period (none envelope)",
+        "kafka source caught up to produced offsets within catchup budget (none envelope)",
         {"source": SOURCE_NONE_TEXT, "target_offset": max_produced},
     )
 
diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
index 876f5ff5a8e5e..635efe79fac69 100755
--- a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
+++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
@@ -25,8 +25,10 @@
   2. Picks a per-invocation prefix so concurrent driver instances scope to
      disjoint MV rows.
   3. INSERTs N rows tagged with the prefix.
-  4. Requests an Antithesis quiet period and polls the MV until the count
-     for the prefix equals N.
+  4. Polls the MV until the count for the prefix equals N. The global
+     fault-orchestrator service drives quiet/active windows on its own
+     cadence; this driver's catchup timeout is sized to span at least
+     one quiet window so the read can complete during it.
   5. Asserts:
        - `always(...)` the MV count matches what was inserted (no over- or
          under-counting after settle).
@@ -45,19 +47,21 @@
 import time
 
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_pg import execute_retry, query_one_retry
-from helper_quiet import request_quiet_period
 from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
 LOG = logging.getLogger("driver.mv_reflects_table_updates")
 
 INSERTS_PER_INVOCATION = 40
-QUIET_PERIOD_S = 20
-CATCHUP_TIMEOUT_S = 60.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus enough buffer for the MV catchup itself
+# during that window.
+CATCHUP_TIMEOUT_S = 90.0
 CATCHUP_POLL_INTERVAL_S = 0.5
 
 
@@ -97,8 +101,6 @@ def main() -> int:
         params,
     )
 
-    request_quiet_period(QUIET_PERIOD_S)
-
     # Poll the MV until the row_count for this prefix reaches N. The MV's
     # `COUNT(*) GROUP BY prefix` shape means the row for this prefix may
     # appear partially populated during the catchup window.
@@ -112,7 +114,7 @@ def main() -> int:
 
     sometimes(
         caught_up,
-        "mv: row_count caught up to inserted count after quiet period",
+        "mv: row_count caught up to inserted count within catchup budget",
         {
             "mv": MV_NAME,
             "table": TABLE_MV_INPUT,
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
index 233207ff8e3c6..4537985fe64ca 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
@@ -20,7 +20,7 @@
      collide.
   3. Inserts ROWS_PER_INVOCATION rows to the MySQL primary, recording the
      expected {id → value} map locally.
-  4. Requests an Antithesis quiet period and polls the Materialize source
+  4. Polls the Materialize source
      table until all expected rows appear (or the budget expires).
   5. Asserts correctness via `always(...)` on count and per-row values.
      A `sometimes(...)` liveness anchor fires on successful catchup.
@@ -39,10 +39,10 @@
 
 import helper_mysql
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_mysql_source import SOURCE_NAME, TABLE_NAME
 from helper_pg import query_retry
-from helper_quiet import request_quiet_period
+
+from antithesis.assertions import always, sometimes
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
@@ -50,8 +50,10 @@
 LOG = logging.getLogger("driver.mysql_cdc")
 
 ROWS_PER_INVOCATION = 20
-QUIET_PERIOD_S = 25
-CATCHUP_TIMEOUT_S = 90.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus the time for replica → source → MZ
+# catchup itself, which can stretch under intermittent network faults.
+CATCHUP_TIMEOUT_S = 120.0
 POLL_INTERVAL_S = 1.0
 
 
@@ -174,16 +176,14 @@ def main() -> int:
         LOG.info("no rows inserted successfully this invocation; exiting cleanly")
         return 0
 
-    LOG.info("inserted %d rows; requesting quiet period", len(expected))
-    request_quiet_period(QUIET_PERIOD_S)
-
+    LOG.info("inserted %d rows; waiting for catchup", len(expected))
     caught_up = _wait_for_catchup(batch_id, len(expected))
 
     # Liveness anchor: at least one invocation should fully catch up. If this
     # never fires across an entire run the safety assertions below are vacuous.
     sometimes(
         caught_up,
-        "mysql: CDC source caught up to all primary inserts after quiet period",
+        "mysql: CDC source caught up to all primary inserts within catchup budget",
         {
             "source": TABLE_NAME,
             "batch_id": batch_id,
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
index 00542bba536bc..a7075a276c365 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
@@ -40,7 +40,7 @@
   2. Picks a per-invocation `batch_id` prefix so concurrent drivers
      (including the InnoDB sibling) don't collide.
   3. Inserts ROWS_PER_INVOCATION rows to the MyISAM table on the primary.
-  4. Requests an Antithesis quiet period and polls the Materialize source
+  4. Polls the Materialize source
      table until all expected rows appear (or the budget expires).
   5. Asserts correctness via `always(...)` on count and per-row values.
 """
@@ -53,7 +53,6 @@
 
 import helper_mysql
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_mysql_source import (
     MYSQL_DATABASE,
     MYSQL_TABLE_MYISAM,
@@ -61,7 +60,8 @@
     TABLE_NAME_MYISAM,
 )
 from helper_pg import query_retry
-from helper_quiet import request_quiet_period
+
+from antithesis.assertions import always, sometimes
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
@@ -69,8 +69,9 @@
 LOG = logging.getLogger("driver.mysql_myisam")
 
 ROWS_PER_INVOCATION = 20
-QUIET_PERIOD_S = 25
-CATCHUP_TIMEOUT_S = 90.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus replica → source → MZ catchup time.
+CATCHUP_TIMEOUT_S = 120.0
 POLL_INTERVAL_S = 1.0
 
 
@@ -199,14 +200,12 @@ def main() -> int:
         LOG.info("no rows inserted successfully this invocation; exiting cleanly")
         return 0
 
-    LOG.info("inserted %d rows; requesting quiet period", len(expected))
-    request_quiet_period(QUIET_PERIOD_S)
-
+    LOG.info("inserted %d rows; waiting for catchup", len(expected))
     caught_up = _wait_for_catchup(batch_id, len(expected))
 
     sometimes(
         caught_up,
-        "mysql myisam: CDC source caught up to all primary inserts after quiet period",
+        "mysql myisam: CDC source caught up to all primary inserts within catchup budget",
         {
             "source": TABLE_NAME_MYISAM,
             "batch_id": batch_id,
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 945fd0805e515..3e698eeb1e660 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -47,7 +47,6 @@
 
 import helper_random
 import psycopg
-from antithesis.assertions import always, sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -57,6 +56,7 @@
     PGUSER_INTERNAL,
 )
 
+from antithesis.assertions import always, sometimes
 from materialize.data_ingest.query_error import QueryError
 from materialize.parallel_workload import executor as _pw_executor
 from materialize.parallel_workload.action import (
@@ -258,9 +258,7 @@ def _matches_setup_tolerance(exc: BaseException) -> bool:
     signal).
     """
     msg = getattr(exc, "msg", None) or str(exc)
-    return any(
-        pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS)
-    )
+    return any(pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS))
 
 
 def _worker_death_tolerable(occurred: Exception | None) -> bool:
@@ -412,7 +410,9 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None:
         "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA "
         "REGISTRY URL 'http://schema-registry:8081'",
     )
-    _tolerate_setup_race(exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'")
+    _tolerate_setup_race(
+        exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'"
+    )
     _tolerate_setup_race(
         exe.execute,
         "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS ("
@@ -596,7 +596,11 @@ def _run_invocation(
                     dead = [t for t in threads if not t.is_alive()]
                     if dead:
                         occurred = next(
-                            (w.occurred_exception for w in workers if w.occurred_exception),
+                            (
+                                w.occurred_exception
+                                for w in workers
+                                if w.occurred_exception
+                            ),
                             None,
                         )
                         worker_failed = WorkerFailedException(
@@ -667,7 +671,9 @@ def _run_invocation(
         "parallel workload: worker thread death tolerated as fault-injection consequence",
         {
             "error": (
-                str(worker_failed.cause) if worker_failed and worker_failed.cause else None
+                str(worker_failed.cause)
+                if worker_failed and worker_failed.cause
+                else None
             ),
             "uncaptured": worker_failed is not None and worker_failed.cause is None,
         },
diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
index 19e7d1d698dbc..71cb339149018 100755
--- a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
+++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
@@ -37,7 +37,9 @@
          invariant.
        - `always(final >= max(count), …)` for the closing observation.
        - `sometimes(...)` liveness anchor confirming the closing
-         observation reached the inserted count after the quiet period.
+         observation reached the inserted count within the final-read
+         budget (which is sized to span at least one quiet window from
+         the global fault-orchestrator).
 
 Read failures (connect timeout, server unavailable mid-fault) are skipped
 rather than recorded — they are not regression evidence, and a False
@@ -57,7 +59,6 @@
 
 import helper_random
 import psycopg
-from antithesis.assertions import always, sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -65,17 +66,20 @@
     PGUSER,
     execute_retry,
 )
-from helper_quiet import request_quiet_period
 from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
 LOG = logging.getLogger("driver.strict_serializable_reads")
 
 STEPS_PER_INVOCATION = 12
-QUIET_PERIOD_S = 15
-FINAL_READ_TIMEOUT_S = 30.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus the time the final read needs after
+# the MV catches up.
+FINAL_READ_TIMEOUT_S = 90.0
 FINAL_READ_POLL_S = 0.5
 PROBE_CONNECT_TIMEOUT_S = 5
 
@@ -150,8 +154,8 @@ def main() -> int:
         observations.append((step, observed))
 
     # Settle and take the closing observation. The driver is short and the
-    # observations list is small, so a generous timeout here is fine.
-    request_quiet_period(QUIET_PERIOD_S)
+    # observations list is small, so a generous timeout here is fine — long
+    # enough to span at least one global-orchestrator quiet window.
     expected_final = len(observations) and observations[-1][0]
     # `expected_final` is the largest step that was actually INSERTed (we
     # may have bailed early). It's an *upper bound* on the count — the
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
index 296bf115fd425..84a9a47369a50 100644
--- a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
@@ -59,10 +59,8 @@
 import sys
 
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_pg import query_retry
-from helper_quiet import request_quiet_period
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
     SOURCE_UPSERT_TEXT,
@@ -70,6 +68,8 @@
     ensure_upsert_text_source,
 )
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -89,8 +89,10 @@
 # many short invocations rather than one big one.
 ANCIENT_KEYS_PER_INVOCATION = 5
 
-QUIET_PERIOD_S = 20
-CATCHUP_TIMEOUT_S = 60.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus the time the upsert source needs to
+# advance offset_committed past our produces.
+CATCHUP_TIMEOUT_S = 90.0
 
 
 def _produce(producer, tracker, topic: str, key: str, value: str) -> None:
@@ -188,13 +190,12 @@ def main() -> int:
         LOG.info("no produces confirmed; exiting cleanly")
         return 0
 
-    request_quiet_period(QUIET_PERIOD_S)
     caught_up = wait_for_catchup(
         SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
     )
     sometimes(
         caught_up,
-        "upsert: source caught up after cross-invocation produces",
+        "upsert: source caught up after cross-invocation produces within catchup budget",
         {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced},
     )
     if not caught_up:
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index 68734c7f03c82..7e3032258dee5 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -10,9 +10,9 @@
 
 """Antithesis driver for property `upsert-key-reflects-latest-value`.
 
-For each key produced to a Kafka UPSERT-envelope source, after a quiet period
-that lets Materialize catch up, the source's row for that key must reflect the
-last value produced — or be absent if the last message was a tombstone.
+For each key produced to a Kafka UPSERT-envelope source, once Materialize
+catches up, the source's row for that key must reflect the last value
+produced — or be absent if the last message was a tombstone.
 
 Each invocation:
   1. Ensures the upsert source exists (idempotent CREATE ... IF NOT EXISTS).
@@ -20,8 +20,9 @@
      interfere with each other's expected-state model.
   3. Produces a deterministic mix of upserts and tombstones, tracking the
      local "what should the source say" model.
-  4. Requests an Antithesis quiet period and waits for offset_committed to
-     reach the highest produced offset.
+  4. Waits for offset_committed to reach the highest produced offset. The
+     global fault-orchestrator drives quiet/active windows; this driver
+     just polls until catchup completes or the budget expires.
   5. For every tracked key, asserts that what's in the source matches the
      local model. Live keys use one assertion message, tombstoned keys use
      another, so triage can distinguish the two failure modes.
@@ -37,10 +38,8 @@
 import sys
 
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_pg import query_one_retry
-from helper_quiet import request_quiet_period
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
     SOURCE_UPSERT_TEXT,
@@ -48,6 +47,8 @@
     ensure_upsert_text_source,
 )
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -67,8 +68,10 @@
 # budget on the same workload shape.
 TOMBSTONE_PROB_RANGE = (0.05, 0.50)
 
-QUIET_PERIOD_S = 20
-CATCHUP_TIMEOUT_S = 60.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus the time the upsert source needs to
+# advance offset_committed past our produces.
+CATCHUP_TIMEOUT_S = 90.0
 
 
 def _produce(producer, tracker, topic: str, key: str, value: str | None) -> None:
@@ -125,9 +128,7 @@ def main() -> int:
     # range. The fuzzer sees this as one of the first decisions in the
     # timeline and can drive it toward whichever extreme reveals a bug.
     tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE)
-    LOG.info(
-        "driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob
-    )
+    LOG.info("driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob)
 
     producer, tracker = make_producer(client_id=f"antithesis-{prefix}")
 
@@ -178,8 +179,9 @@ def main() -> int:
         LOG.info("no messages confirmed delivered this invocation; exiting cleanly")
         return 0
 
-    # Now ask Antithesis to pause faults and wait for Materialize to catch up.
-    request_quiet_period(QUIET_PERIOD_S)
+    # Wait for Materialize to catch up. Quiet windows are driven globally by
+    # the fault-orchestrator service; this catchup timeout is sized to span
+    # at least one such window so the source can advance during it.
     caught_up = wait_for_catchup(
         SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
     )
@@ -189,7 +191,7 @@ def main() -> int:
     # vacuous and the run is uninteresting.
     sometimes(
         caught_up,
-        "upsert: source caught up to produced offsets after quiet period",
+        "upsert: source caught up to produced offsets within catchup budget",
         {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced},
     )
 
diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
index 5612a19c30ea8..fd9c7cf389001 100755
--- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
+++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
@@ -53,7 +53,6 @@
 
 import helper_random
 import psycopg
-from antithesis.assertions import always, sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -63,6 +62,8 @@
     query_retry,
 )
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 3c9876ba79988..3be0de672c626 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -25,8 +25,10 @@
 Each cycle:
   1. Produce a batch of (key, value) and (key, null) messages, updating the
      in-memory `expected_state` model.
-  2. Request a quiet period and wait for `offset_committed` to reach the
-     highest produced offset.
+  2. Wait for `offset_committed` to reach the highest produced offset.
+     The global fault-orchestrator drives quiet/active windows on its
+     own cadence; the per-cycle catchup timeout is sized to span at
+     least one quiet window so settle has somewhere to land.
   3. SELECT every tracked key's current source state and assert it matches
      `expected_state` via `always("upsert: rehydrated state equals
      local model", ...)`. Across-cycle stability is exactly what
@@ -38,15 +40,15 @@
 
 A previous version of this driver also recorded a "clusterd observed
 non-online" `sometimes` anchor via a once-per-cycle SELECT of
-`mz_internal.mz_cluster_replica_statuses`. That assertion was structurally
-unable to fire here: each cycle requests a 25-second Antithesis quiet
-period before its assertions, the probe runs *after* the quiet period
-(when faults are paused and killed containers have been restored), and
-the introspection view itself lags clusterd death by the
-orchestrator-process 5-second poll. The "did we see a replica go
-offline" signal lives in `anytime_fault_recovery_exercised.py` instead,
-which polls continuously and never requests a quiet period, so it has
-the right shape to observe the offline window.
+`mz_internal.mz_cluster_replica_statuses`. That assertion was
+structurally unable to fire here: when faults are paused (either by
+the old per-driver `ANTITHESIS_STOP_FAULTS` calls or by the new global
+fault-orchestrator's quiet window) killed containers are restored
+before the probe runs, and the introspection view itself lags clusterd
+death by the orchestrator-process 5-second poll. The "did we see a
+replica go offline" signal lives in `anytime_fault_recovery_exercised.py`
+instead, which polls continuously and is unaffected by quiet windows,
+so it has the right shape to observe the offline window.
 
 Distinct prefix per timeline keeps multiple parallel timelines independent.
 """
@@ -58,10 +60,8 @@
 import time
 
 import helper_random
-from antithesis.assertions import always, sometimes
 from helper_kafka import make_producer
 from helper_pg import query_one_retry
-from helper_quiet import request_quiet_period
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
     SOURCE_UPSERT_TEXT,
@@ -69,6 +69,8 @@
     ensure_upsert_text_source,
 )
 
+from antithesis.assertions import always, sometimes
+
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
 )
@@ -90,8 +92,10 @@
 # rehydration, not just per-cycle convergence.
 TOMBSTONE_PROB_RANGE = (0.05, 0.50)
 
-QUIET_PERIOD_S = 25
-CATCHUP_TIMEOUT_S = 120.0
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) and survive a clusterd restart inside it;
+# rehydration after a kill is the whole point of this driver.
+CATCHUP_TIMEOUT_S = 180.0
 INTER_CYCLE_SLEEP_S = 2.0
 
 
@@ -167,7 +171,8 @@ def _run_cycle(
         LOG.info("cycle %d: no messages confirmed delivered; skipping", cycle_idx)
         return False
 
-    request_quiet_period(QUIET_PERIOD_S)
+    # The global fault-orchestrator drives quiet windows; this catchup
+    # timeout is sized to span one and survive a clusterd kill in it.
     caught_up = wait_for_catchup(
         SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S
     )
@@ -234,7 +239,7 @@ def main() -> int:
     # The "did this run actually span a clusterd restart" anchor is
     # deliberately not in this driver — see the module docstring. The
     # `cycles_run >= 2` check below is the rehydration-coverage anchor:
-    # without two post-quiet-period reads, the safety assertions could
+    # without two settle-then-read cycles, the safety assertions could
     # be vacuously satisfied by a single early settle.
     sometimes(
         cycles_run >= 2,

From cc65e8eee969ee56a0a594d4ef172e8b5ab9485f Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 17:23:25 -0400
Subject: [PATCH 58/65] test/antithesis: bump connect/retry timeouts to span
 fault-orchestrator windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The global fault-orchestrator alternates faults-ON/OFF windows of up
to MAX_ON / MAX_OFF seconds each (defaults 40s, set in the
FaultOrchestrator service). With the previous timeouts a single
connect attempt or producer flush could expire entirely inside one
40s faults-ON window — fast-failing before the orchestrator opened
the next quiet window and burning retry budget on TCP timeouts.

helper_pg:
  CONNECT_TIMEOUT_S  15 -> 30   (renamed from _CONNECT_TIMEOUT_S so
                                 the parallel-workload driver can
                                 reuse it instead of hardcoding 15)
  _RETRY_BUDGET_S    120 -> 180  (spans one full ON+OFF cycle + margin)

helper_mysql: same logic — same values. Replication adds primary→replica
hops so the budgets match helper_pg's. `wait_for_host`'s 5s probe stays
short: it runs at bootstrap before fault injection begins.

helper_kafka: new explicit librdkafka producer config —
`request.timeout.ms=60000`, `delivery.timeout.ms=180000`. New module-
level `ADMIN_TIMEOUT_S=90` for `admin.list_topics` and `create_topics`
result waits; new `FLUSH_TIMEOUT_S=90` exported for drivers so
`producer.flush(timeout=...)` waits past a single MAX_ON window before
still-pending messages are treated as grounds for skipping assertions.

The direct psycopg.connect calls in parallel_driver_parallel_workload
(3 sites) now use `CONNECT_TIMEOUT_S` instead of literal 15. The four
Kafka-source drivers' `producer.flush(timeout=30)` calls now use
`FLUSH_TIMEOUT_S` from helper_kafka.

Probe timeouts are intentionally kept short — they exist to *measure*
unavailability, not wait through it:
  anytime_fault_recovery_exercised.PROBE_CONNECT_TIMEOUT_S = 2.0
  singleton_driver_catalog_recovery_consistency.PROBE_CONNECT_TIMEOUT_S = 2.0
  parallel_driver_strict_serializable_reads.PROBE_CONNECT_TIMEOUT_S = 5

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/workload/test/helper_kafka.py | 36 +++++++++++++++++--
 test/antithesis/workload/test/helper_mysql.py | 12 +++++--
 test/antithesis/workload/test/helper_pg.py    | 29 +++++++++------
 .../parallel_driver_kafka_none_envelope.py    |  4 +--
 .../test/parallel_driver_parallel_workload.py |  7 ++--
 ...llel_driver_upsert_ancient_key_writable.py |  4 +--
 .../parallel_driver_upsert_latest_value.py    |  4 +--
 ...ngleton_driver_upsert_state_rehydration.py |  4 +--
 8 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py
index a9bf2eac600a1..3486ce79454cd 100644
--- a/test/antithesis/workload/test/helper_kafka.py
+++ b/test/antithesis/workload/test/helper_kafka.py
@@ -28,6 +28,34 @@
 
 BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092")
 
+# Per-RPC and per-delivery timeouts for librdkafka. Default
+# `request.timeout.ms` is 30s, which can fail entirely inside a single
+# faults-ON window (MAX_ON defaults to 40s in the global fault-
+# orchestrator). Bumping it gives one request a real chance of spanning
+# the transition into the next quiet window before failing. librdkafka
+# also requires `delivery.timeout.ms` to be >= `request.timeout.ms +
+# linger.ms`; we pin both explicitly so the relationship is reviewable
+# here rather than implicit. `delivery.timeout.ms` is the wall-clock
+# budget the broker side of the producer has to either deliver or fail
+# the message; idempotent retries happen under this umbrella, so the
+# value needs to span at least one full ON+OFF cycle (~80s) plus
+# margin.
+_REQUEST_TIMEOUT_MS = 60_000
+_DELIVERY_TIMEOUT_MS = 180_000
+
+# Wall-clock budget for synchronous admin / flush waits. The orchestrator's
+# longest faults-ON window is MAX_ON (40s default); 90s comfortably spans
+# one such window plus catchup overhead.
+ADMIN_TIMEOUT_S = 90
+
+# Wall-clock budget for `producer.flush(timeout=...)` in drivers. Tuned to
+# absorb at least one MAX_ON window so a produce burst that landed mid-
+# fault still has time to drain after the orchestrator opens its next
+# quiet window. Shorter than `_DELIVERY_TIMEOUT_MS` so a flush that
+# returns with `pending > 0` is a strong signal the producer is still
+# struggling, not that we just ran out of patience.
+FLUSH_TIMEOUT_S = 90
+
 
 @dataclass
 class DeliveryTracker:
@@ -62,6 +90,10 @@ def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTrack
         "linger.ms": 5,
         "enable.idempotence": True,
         "acks": "all",
+        # See module-level _REQUEST_TIMEOUT_MS / _DELIVERY_TIMEOUT_MS for
+        # the fault-orchestrator-aware rationale on these values.
+        "request.timeout.ms": _REQUEST_TIMEOUT_MS,
+        "delivery.timeout.ms": _DELIVERY_TIMEOUT_MS,
     }
     if client_id:
         config["client.id"] = client_id
@@ -71,7 +103,7 @@ def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTrack
 def ensure_topic(topic: str, num_partitions: int = 1) -> None:
     """Create the topic if it doesn't already exist. No-op on race with auto-create."""
     admin = AdminClient({"bootstrap.servers": BROKER})
-    existing = admin.list_topics(timeout=10).topics
+    existing = admin.list_topics(timeout=ADMIN_TIMEOUT_S).topics
     if topic in existing:
         return
     LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions)
@@ -80,7 +112,7 @@ def ensure_topic(topic: str, num_partitions: int = 1) -> None:
     )
     for t, fut in futures.items():
         try:
-            fut.result(timeout=30)
+            fut.result(timeout=ADMIN_TIMEOUT_S)
         except KafkaException as exc:
             # TOPIC_ALREADY_EXISTS = 36
             err = exc.args[0] if exc.args else None
diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py
index f9b79395c556a..35f573186b8f7 100644
--- a/test/antithesis/workload/test/helper_mysql.py
+++ b/test/antithesis/workload/test/helper_mysql.py
@@ -30,7 +30,15 @@
 MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306"))
 MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd")
 
-_RETRY_BUDGET_S = 120
+# See helper_pg for the rationale on these values. The global fault-
+# orchestrator's MAX_ON/MAX_OFF defaults (40s each) mean a per-attempt
+# connect_timeout shorter than ~MAX_ON will fast-fail entirely inside a
+# faults-ON window, and a retry budget shorter than ~one full ON+OFF cycle
+# won't give an attempt a chance to land in the next quiet window. MySQL
+# also adds the primary→replica replication path, so the budget is sized
+# the same as helper_pg's.
+_CONNECT_TIMEOUT_S = 30
+_RETRY_BUDGET_S = 180
 _RETRY_INITIAL_S = 0.5
 _RETRY_MAX_S = 4.0
 
@@ -51,7 +59,7 @@ def _open(host: str, database: str) -> pymysql.connections.Connection:
                 user="root",
                 password=MYSQL_PASSWORD,
                 database=database,
-                connect_timeout=15,
+                connect_timeout=_CONNECT_TIMEOUT_S,
                 autocommit=True,
             )
         except Exception as exc:  # noqa: BLE001
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index 59a88f1963ab3..ac6fada801506 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -38,15 +38,22 @@
 # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds
 # keep drivers progressing without masking real correctness signals.
 #
-# These need to absorb a full Antithesis quiet period plus restart time for the
-# system to come back. Quiet-period requests in the workload are typically
-# 20-25s; the container then takes a few seconds to become responsive, so the
-# overall budget must comfortably exceed ~30s. The per-attempt connect timeout
-# also has to be long enough to actually complete a TCP+TLS handshake against
-# a hung but recovering materialized — too short and every attempt fails fast
-# and the budget is burned without giving the system a chance to answer.
-_CONNECT_TIMEOUT_S = 15
-_RETRY_BUDGET_S = 120
+# The global fault-orchestrator alternates faults-ON/OFF windows of up to
+# MAX_ON / MAX_OFF seconds each (defaults 40s, defined in
+# test/antithesis/mzcompose.py FaultOrchestrator). One full
+# fault-ON+fault-OFF cycle is up to MAX_ON+MAX_OFF ~= 80s.
+#
+# Per-attempt connect_timeout must be long enough that an attempt starting
+# late in a faults-ON window has a real chance of completing across the
+# transition into the next faults-OFF window. A 15s timeout entirely inside
+# a 40s faults-ON window fast-fails before the orchestrator opens a quiet
+# period, burning retry budget on TCP timeouts rather than waiting for
+# materialized to be reachable.
+#
+# Retry budget must comfortably span at least one full ON+OFF cycle plus
+# margin for the system to actually respond once faults pause.
+CONNECT_TIMEOUT_S = 30
+_RETRY_BUDGET_S = 180
 _RETRY_INITIAL_S = 0.1
 _RETRY_MAX_S = 2.0
 
@@ -72,7 +79,7 @@ def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]:
                 port=PGPORT,
                 user=PGUSER,
                 dbname=PGDATABASE,
-                connect_timeout=_CONNECT_TIMEOUT_S,
+                connect_timeout=CONNECT_TIMEOUT_S,
                 autocommit=autocommit,
             )
             break
@@ -169,7 +176,7 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non
                     port=PGPORT_INTERNAL,
                     user=PGUSER_INTERNAL,
                     dbname=PGDATABASE,
-                    connect_timeout=_CONNECT_TIMEOUT_S,
+                    connect_timeout=CONNECT_TIMEOUT_S,
                     autocommit=True,
                 ) as conn,
                 conn.cursor() as cur,
diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
index 656a6b6b6d776..cfda086d9acc7 100755
--- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -47,7 +47,7 @@
 import sys
 
 import helper_random
-from helper_kafka import make_producer
+from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_none_source import (
     SOURCE_NONE_TEXT,
     TOPIC_NONE_TEXT,
@@ -95,7 +95,7 @@ def main() -> int:
         expected_payloads.add(payload)
         producer.poll(0)
 
-    pending = producer.flush(timeout=30)
+    pending = producer.flush(timeout=FLUSH_TIMEOUT_S)
     if pending > 0 or tracker.last_error is not None:
         # Same fail-closed pattern as the upsert driver: under sustained
         # fault injection we cannot prove which messages Kafka accepted, so
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 3e698eeb1e660..6a73f39b4be62 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -48,6 +48,7 @@
 import helper_random
 import psycopg
 from helper_pg import (
+    CONNECT_TIMEOUT_S,
     PGDATABASE,
     PGHOST,
     PGPORT,
@@ -141,7 +142,7 @@ def _prepare_system(num_threads: int) -> None:
             user=PGUSER_INTERNAL,
             dbname=PGDATABASE,
             autocommit=True,
-            connect_timeout=15,
+            connect_timeout=CONNECT_TIMEOUT_S,
         ) as conn,
         conn.cursor() as cur,
     ):
@@ -334,7 +335,7 @@ def _drop_seed_scoped_objects(seed: str) -> None:
                 user=PGUSER,
                 dbname=PGDATABASE,
                 autocommit=True,
-                connect_timeout=15,
+                connect_timeout=CONNECT_TIMEOUT_S,
             ) as conn,
             conn.cursor() as cur,
         ):
@@ -579,7 +580,7 @@ def _run_invocation(
                     user=PGUSER,
                     dbname=PGDATABASE,
                     autocommit=True,
-                    connect_timeout=15,
+                    connect_timeout=CONNECT_TIMEOUT_S,
                 ) as setup_conn,
                 setup_conn.cursor() as setup_cur,
             ):
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
index 84a9a47369a50..8ea69f67c04b1 100644
--- a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
@@ -59,7 +59,7 @@
 import sys
 
 import helper_random
-from helper_kafka import make_producer
+from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_retry
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
@@ -173,7 +173,7 @@ def main() -> int:
         _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, new_value)
         producer.poll(0)
 
-    pending = producer.flush(timeout=30)
+    pending = producer.flush(timeout=FLUSH_TIMEOUT_S)
     if pending > 0 or tracker.last_error is not None:
         # Under sustained fault injection we can't prove which produces
         # Kafka accepted. Bail before asserting — "writes that landed got
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index 7e3032258dee5..125e71b7c114f 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -38,7 +38,7 @@
 import sys
 
 import helper_random
-from helper_kafka import make_producer
+from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_one_retry
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
@@ -159,7 +159,7 @@ def main() -> int:
 
     # Flush all pending deliveries. We poll callbacks while flushing so the
     # tracker reflects the true max produced offset.
-    pending = producer.flush(timeout=30)
+    pending = producer.flush(timeout=FLUSH_TIMEOUT_S)
     if pending > 0 or tracker.last_error is not None:
         # Under sustained fault injection we cannot prove which of the just-
         # produced messages Kafka actually accepted, so `expected` may name
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 3be0de672c626..58e1de5c18ac4 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -60,7 +60,7 @@
 import time
 
 import helper_random
-from helper_kafka import make_producer
+from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_one_retry
 from helper_source_stats import wait_for_catchup
 from helper_upsert_source import (
@@ -156,7 +156,7 @@ def _run_cycle(
             expected[key] = value
         producer.poll(0)
 
-    pending = producer.flush(timeout=30)
+    pending = producer.flush(timeout=FLUSH_TIMEOUT_S)
     if pending > 0 or tracker.last_error is not None:
         LOG.info(
             "cycle %d: skipping assertions; flush pending=%d last_error=%s",

From bb7c5cb924fedb522dbde7ee7d8b05534306fbd5 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 18:42:00 -0400
Subject: [PATCH 59/65] test/antithesis: fault-orchestrator: bash -s -> bash -c
 so script actually runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FaultOrchestrator service was wired up with
`entrypoint: ["bash", "-s"]` and the script body passed as `command`.
But `bash -s` reads commands from stdin — and there's no stdin in a
detached docker container, so bash exited immediately with no output
and the script string was silently bound to `$1` instead of being run.

Net effect: the orchestrator container started, exited cleanly, and
ANTITHESIS_STOP_FAULTS was never called. Antithesis fault injection
ran unconstrained for the entire run, with no quiet windows ever
opening. Every driver that needed more than one connection (the four
Kafka drivers do CREATE CONNECTION + admin metadata fetch + CREATE
SOURCE; the two MySQL drivers do primary writes + MZ reads; the
parallel-workload driver does multi-step setup; the strict-
serializable driver opens many fresh psycopg connects) effectively
starved.

Only `parallel_driver_mv_reflects_table_updates.py` ever reached its
"driver done" log line: it does one batched INSERT and then polls
materialize on a single retried connection, so a brief calm in the
faults occasionally let it through.

Fix: use `bash -c <script>`, which actually runs the script string.
Verified locally that the round-tripped YAML script body executes
under bash -c, including the no-op exit when ANTITHESIS_STOP_FAULTS
is unset.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml |  2 +-
 test/antithesis/mzcompose.py               | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index ecd6eea161e07..cf9a35d147c97 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -623,7 +623,7 @@ services:
     image: bash:5
     entrypoint:
     - bash
-    - -s
+    - -c
     command:
     - "#!/usr/bin/env bash\n\n# Copyright Materialize, Inc. and contributors. All\
       \ rights reserved.\n#\n# Use of this software is governed by the Business Source\
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 0f333afeee8e4..9fef5db40faed 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -120,9 +120,14 @@ def __init__(self) -> None:
             # `sleep` via busybox — everything the script uses. Public
             # image, so it sails through export-compose.py untouched.
             "image": "bash:5",
-            # `bash -s` reads the script from stdin via a here-string;
-            # keeps the YAML readable instead of one giant `-c` blob.
-            "entrypoint": ["bash", "-s"],
+            # `bash -c <script>` runs the script string. Earlier this used
+            # `bash -s`, which reads commands from stdin — in a docker
+            # container there's nothing on stdin so bash exited cleanly
+            # with no output and the orchestrator silently no-op'd. Net
+            # effect: Antithesis fault injection ran unconstrained with
+            # no quiet windows ever opening, which starved every driver
+            # that needed more than one connection.
+            "entrypoint": ["bash", "-c"],
             "command": [script],
             "environment": [
                 # Defaults chosen so MAX_ON stays well under the smallest

From 81413cf35ba377a31899884a92be83cb2f90333c Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 19:54:23 -0400
Subject: [PATCH 60/65] test/antithesis: lifecycle logging + per-invocation
 correlation IDs

Debugging the recent fault-orchestrator no-op required staring at logs
without a way to tell which of N concurrent driver invocations a given
line came from, and the helpers logged so little that "stuck at
ensure_kafka_connection" looked the same as "stuck at the CREATE SOURCE
broker validation" or "stuck on the kafka admin metadata fetch."

Two changes:

1. helper_logging.py mints a short hex `INVOCATION_ID` once per process
   and stamps it into every log record via the root formatter
   (`[inv=<id>]`). Every driver swaps its bare `logging.basicConfig`
   block for `helper_logging.setup_logging(name)`, so grepping
   `inv=a3f9b1c2` isolates one invocation's records across helpers,
   threads, and subprocesses.

2. Helpers grow start/done lifecycle logs with elapsed timings at every
   meaningful checkpoint:

   helper_pg.connect             start, established/giving up + elapsed
   helper_pg.execute_retry       per-statement, with truncated SQL summary
   helper_pg.query_retry         same shape, plus row count on success
   helper_pg.execute_internal    same shape
   helper_mysql._open            per-host start/established/giving up
   helper_kafka.make_producer    config snapshot at construction
   helper_kafka.ensure_topic     probing / list_topics done / creating / created
   helper_upsert_source          per-phase markers around ensure_kafka_connection
                                 and ensure_upsert_text_source
   helper_none_source            same
   helper_table_mv               per-phase markers
   helper_source_stats           wait_for_catchup start, progress, done/timeout

   Retry attempts get attempt count + per-attempt elapsed + budget used,
   so "still trying" vs. "burning the budget" is visible in one line.

`bin/pyactivate -m ruff check --fix` cleaned up the unused `import
logging` statements that fell out of the basicConfig removal.

LOG_LEVEL env var (default INFO) lets a triage user re-run with
`LOG_LEVEL=DEBUG` to surface the per-statement SQL summaries the helpers
emit at DEBUG without flooding INFO.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../test/anytime_fault_recovery_exercised.py  |  10 +-
 .../test/anytime_kafka_frontier_monotonic.py  |  10 +-
 ..._kafka_offset_known_not_below_committed.py |  10 +-
 ...nytime_kafka_source_resumes_after_fault.py |  10 +-
 .../anytime_mysql_source_no_gtid_errors.py    |  10 +-
 .../test/first_mysql_replica_setup.py         |  14 +-
 .../first_select_upsert_implementation.py     |  10 +-
 test/antithesis/workload/test/helper_kafka.py |  45 +++++-
 .../workload/test/helper_logging.py           |  69 +++++++++
 test/antithesis/workload/test/helper_mysql.py |  40 +++++-
 .../workload/test/helper_none_source.py       |   9 +-
 test/antithesis/workload/test/helper_pg.py    | 136 ++++++++++++++++--
 .../workload/test/helper_source_stats.py      |  19 ++-
 .../workload/test/helper_table_mv.py          |   8 +-
 .../workload/test/helper_upsert_source.py     |  17 ++-
 .../parallel_driver_kafka_none_envelope.py    |   7 +-
 ...rallel_driver_mv_reflects_table_updates.py |   7 +-
 .../test/parallel_driver_mysql_cdc.py         |   7 +-
 .../test/parallel_driver_mysql_myisam.py      |   7 +-
 .../test/parallel_driver_parallel_workload.py |   7 +-
 ...rallel_driver_strict_serializable_reads.py |   7 +-
 ...llel_driver_upsert_ancient_key_writable.py |   7 +-
 .../parallel_driver_upsert_latest_value.py    |   7 +-
 ...ton_driver_catalog_recovery_consistency.py |   7 +-
 ...ngleton_driver_upsert_state_rehydration.py |   7 +-
 25 files changed, 372 insertions(+), 115 deletions(-)
 create mode 100644 test/antithesis/workload/test/helper_logging.py

diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
index 65f3ed4f695f0..e3e38862e0e19 100755
--- a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
+++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py
@@ -39,13 +39,12 @@
 
 from __future__ import annotations
 
-import logging
 import os
 import sys
 import time
 
+import helper_logging
 import psycopg
-from antithesis.assertions import sometimes
 from helper_pg import (
     PGDATABASE,
     PGHOST,
@@ -54,10 +53,9 @@
     query_one_retry,
 )
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.fault_recovery_exercised")
+from antithesis.assertions import sometimes
+
+LOG = helper_logging.setup_logging("driver.fault_recovery_exercised")
 
 POLL_INTERVAL_S = 0.5
 RUN_BUDGET_S = 30.0
diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
index efd906a725844..09af679d04725 100755
--- a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
+++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py
@@ -38,18 +38,16 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
-from antithesis.assertions import always
+import helper_logging
 from helper_pg import query_retry
 from helper_source_stats import offset_committed
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.kafka_frontier_monotonic")
+from antithesis.assertions import always
+
+LOG = helper_logging.setup_logging("driver.kafka_frontier_monotonic")
 
 # Knobs.
 POLL_INTERVAL_S = 0.5
diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
index a8d6be62ae6a9..fbac4205239d2 100755
--- a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
+++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
@@ -36,17 +36,15 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
-from antithesis.assertions import always
+import helper_logging
 from helper_pg import query_retry
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.kafka_offset_known_not_below_committed")
+from antithesis.assertions import always
+
+LOG = helper_logging.setup_logging("driver.kafka_offset_known_not_below_committed")
 
 POLL_INTERVAL_S = 0.5
 RUN_BUDGET_S = 30.0
diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
index 9c10879bd8291..d2219f4555bdb 100755
--- a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
+++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
@@ -45,18 +45,16 @@
 
 from __future__ import annotations
 
-import logging
 import os
 import sys
 import time
 
-from antithesis.assertions import reachable, sometimes
+import helper_logging
 from helper_pg import query_one_retry, query_retry
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.kafka_source_resumes_after_fault")
+from antithesis.assertions import reachable, sometimes
+
+LOG = helper_logging.setup_logging("driver.kafka_source_resumes_after_fault")
 
 POLL_INTERVAL_S = 1.0
 RUN_BUDGET_S = 45.0
diff --git a/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py
index 264ad67584f1a..e10245d3d3e31 100644
--- a/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py
+++ b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py
@@ -41,18 +41,16 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
-from antithesis.assertions import always
+import helper_logging
 from helper_mysql_source import SOURCE_NAME
 from helper_pg import query_retry
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.mysql_source_no_gtid_errors")
+from antithesis.assertions import always
+
+LOG = helper_logging.setup_logging("driver.mysql_source_no_gtid_errors")
 
 # Knobs.
 POLL_INTERVAL_S = 2.0
diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py
index f697334beb51f..2330e7df6c85f 100644
--- a/test/antithesis/workload/test/first_mysql_replica_setup.py
+++ b/test/antithesis/workload/test/first_mysql_replica_setup.py
@@ -25,18 +25,16 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
+import helper_logging
 import helper_mysql
-from antithesis.assertions import reachable, sometimes
 from helper_mysql_source import ensure_mysql_cdc_source
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("first.mysql_replica_setup")
+from antithesis.assertions import reachable, sometimes
+
+LOG = helper_logging.setup_logging("first.mysql_replica_setup")
 
 
 def setup_primary() -> None:
@@ -80,7 +78,9 @@ def setup_primary() -> None:
         """,
         database="antithesis",
     )
-    LOG.info("antithesis.cdc_test (InnoDB) and cdc_test_myisam (MyISAM) ready on primary")
+    LOG.info(
+        "antithesis.cdc_test (InnoDB) and cdc_test_myisam (MyISAM) ready on primary"
+    )
 
 
 def configure_replica() -> None:
diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py
index 584f40da7812c..b334efd1bbcf2 100755
--- a/test/antithesis/workload/test/first_select_upsert_implementation.py
+++ b/test/antithesis/workload/test/first_select_upsert_implementation.py
@@ -25,17 +25,15 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 
+import helper_logging
 import helper_random
-from antithesis.assertions import sometimes
 from helper_pg import execute_internal_retry
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("first.select_upsert_implementation")
+from antithesis.assertions import sometimes
+
+LOG = helper_logging.setup_logging("first.select_upsert_implementation")
 
 
 def main() -> int:
diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py
index 3486ce79454cd..ca3bfbc15f613 100644
--- a/test/antithesis/workload/test/helper_kafka.py
+++ b/test/antithesis/workload/test/helper_kafka.py
@@ -19,6 +19,7 @@
 import logging
 import os
 import threading
+import time
 from dataclasses import dataclass, field
 
 from confluent_kafka import KafkaException, Producer
@@ -97,26 +98,64 @@ def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTrack
     }
     if client_id:
         config["client.id"] = client_id
+    LOG.info(
+        "kafka producer: building (broker=%s client_id=%s request_timeout=%dms delivery_timeout=%dms)",
+        BROKER,
+        client_id or "<auto>",
+        _REQUEST_TIMEOUT_MS,
+        _DELIVERY_TIMEOUT_MS,
+    )
     return Producer(config), DeliveryTracker()
 
 
 def ensure_topic(topic: str, num_partitions: int = 1) -> None:
     """Create the topic if it doesn't already exist. No-op on race with auto-create."""
+    LOG.info("kafka admin: probing topic %s (broker=%s)", topic, BROKER)
     admin = AdminClient({"bootstrap.servers": BROKER})
-    existing = admin.list_topics(timeout=ADMIN_TIMEOUT_S).topics
+    list_start = time.monotonic()
+    try:
+        existing = admin.list_topics(timeout=ADMIN_TIMEOUT_S).topics
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning(
+            "kafka admin: list_topics failed in %.2fs (timeout=%ds): %s",
+            time.monotonic() - list_start,
+            ADMIN_TIMEOUT_S,
+            exc,
+        )
+        raise
+    LOG.info(
+        "kafka admin: list_topics returned %d topics in %.2fs",
+        len(existing),
+        time.monotonic() - list_start,
+    )
     if topic in existing:
+        LOG.info("kafka admin: topic %s already present; skipping create", topic)
         return
-    LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions)
+    LOG.info(
+        "kafka admin: creating topic %s with %d partition(s)", topic, num_partitions
+    )
+    create_start = time.monotonic()
     futures = admin.create_topics(
         [NewTopic(topic, num_partitions=num_partitions, replication_factor=1)]
     )
     for t, fut in futures.items():
         try:
             fut.result(timeout=ADMIN_TIMEOUT_S)
+            LOG.info(
+                "kafka admin: topic %s created in %.2fs",
+                t,
+                time.monotonic() - create_start,
+            )
         except KafkaException as exc:
             # TOPIC_ALREADY_EXISTS = 36
             err = exc.args[0] if exc.args else None
             if err is not None and getattr(err, "code", lambda: None)() == 36:
-                LOG.info("kafka topic %s raced with auto-create; continuing", t)
+                LOG.info("kafka admin: topic %s raced with auto-create; continuing", t)
                 continue
+            LOG.warning(
+                "kafka admin: topic %s create failed in %.2fs: %s",
+                t,
+                time.monotonic() - create_start,
+                exc,
+            )
             raise
diff --git a/test/antithesis/workload/test/helper_logging.py b/test/antithesis/workload/test/helper_logging.py
new file mode 100644
index 0000000000000..96544e2cd816f
--- /dev/null
+++ b/test/antithesis/workload/test/helper_logging.py
@@ -0,0 +1,69 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Logging setup with per-invocation correlation IDs for Antithesis drivers.
+
+Antithesis launches each Test Composer command as a fresh process, and
+many drivers run concurrently in the same timeline. Correlating log
+records back to a specific invocation — "which of the eight in-flight
+parallel_driver_upsert_latest_value processes was the one that stalled
+on flush?" — requires a stable per-invocation token that appears in
+every line.
+
+Use:
+
+    import helper_logging
+    LOG = helper_logging.setup_logging("driver.my_driver")
+    LOG.info("starting; prefix=%s", prefix)
+    # 2026-... INFO driver.my_driver [inv=a3f9b1c2] starting; prefix=p...
+
+The same INVOCATION_ID is exported as a module attribute so drivers can
+include it in assertion-detail dicts, subprocess invocations, Kafka
+client.ids, and anywhere else a correlation token is useful.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import secrets
+
+# Short hex string minted once per process at import time. 32 bits is
+# enough to make collisions vanishingly rare across concurrent drivers
+# in one timeline; staying short keeps it readable inline in every line.
+INVOCATION_ID = secrets.token_hex(4)
+
+
+def setup_logging(name: str | None = None) -> logging.Logger:
+    """Install a root handler with the invocation-ID-stamped formatter and
+    return a named logger.
+
+    Idempotent: if the root already has a handler, the configuration is
+    left alone and only the named logger is returned. This means the
+    first driver/helper to call `setup_logging` in a process wins the
+    format, which is fine because the format template is hard-coded
+    here, so every caller would install the same one.
+
+    Log level defaults to INFO; override via `LOG_LEVEL` env var (e.g.
+    `LOG_LEVEL=DEBUG` to see the helpers' per-call start/success logs).
+    """
+    root = logging.getLogger()
+    if not root.handlers:
+        handler = logging.StreamHandler()
+        handler.setFormatter(
+            logging.Formatter(
+                fmt=(
+                    f"%(asctime)s %(levelname)s %(name)s "
+                    f"[inv={INVOCATION_ID}] %(message)s"
+                )
+            )
+        )
+        root.addHandler(handler)
+        root.setLevel(os.environ.get("LOG_LEVEL", "INFO").upper())
+    return logging.getLogger(name)
diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py
index 35f573186b8f7..ea10922ea3061 100644
--- a/test/antithesis/workload/test/helper_mysql.py
+++ b/test/antithesis/workload/test/helper_mysql.py
@@ -49,11 +49,22 @@ def _retryable(exc: BaseException) -> bool:
 
 def _open(host: str, database: str) -> pymysql.connections.Connection:
     """Open a single MySQL connection with retries on transient errors."""
-    deadline = time.monotonic() + _RETRY_BUDGET_S
+    start = time.monotonic()
+    deadline = start + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
+    attempt = 0
+    LOG.debug(
+        "mysql connect: starting (host=%s db=%s timeout=%ds budget=%ds)",
+        host,
+        database,
+        _CONNECT_TIMEOUT_S,
+        _RETRY_BUDGET_S,
+    )
     while True:
+        attempt += 1
+        attempt_start = time.monotonic()
         try:
-            return pymysql.connect(
+            conn = pymysql.connect(
                 host=host,
                 port=MYSQL_PORT,
                 user="root",
@@ -62,12 +73,35 @@ def _open(host: str, database: str) -> pymysql.connections.Connection:
                 connect_timeout=_CONNECT_TIMEOUT_S,
                 autocommit=True,
             )
+            LOG.info(
+                "mysql connect: %s established on attempt %d in %.2fs (total %.2fs)",
+                host,
+                attempt,
+                time.monotonic() - attempt_start,
+                time.monotonic() - start,
+            )
+            return conn
         except Exception as exc:  # noqa: BLE001
+            elapsed_attempt = time.monotonic() - attempt_start
+            elapsed_total = time.monotonic() - start
             if not _retryable(exc) or time.monotonic() > deadline:
+                LOG.warning(
+                    "mysql connect: %s giving up after attempt %d (%.2fs attempt, %.2fs total): %s",
+                    host,
+                    attempt,
+                    elapsed_attempt,
+                    elapsed_total,
+                    exc,
+                )
                 raise
             LOG.info(
-                "mysql connect to %s retrying after %s; backoff=%.2fs",
+                "mysql connect: %s attempt %d failed in %.2fs (%.2fs of %ds used): %s; "
+                "sleeping %.2fs",
                 host,
+                attempt,
+                elapsed_attempt,
+                elapsed_total,
+                _RETRY_BUDGET_S,
                 exc,
                 backoff,
             )
diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py
index 87a90b1ac6087..d036f8e63aa18 100644
--- a/test/antithesis/workload/test/helper_none_source.py
+++ b/test/antithesis/workload/test/helper_none_source.py
@@ -40,6 +40,11 @@ def ensure_none_text_source() -> None:
     Reuses the shared `antithesis_kafka_conn` Kafka connection so multiple
     drivers don't proliferate connections.
     """
+    LOG.info(
+        "ensure_none_text_source: starting (source=%s topic=%s)",
+        SOURCE_NONE_TEXT,
+        TOPIC_NONE_TEXT,
+    )
     ensure_kafka_connection()
     # CREATE SOURCE issues a Kafka metadata fetch that fails fast if the topic
     # is missing; broker auto-create only fires on a producer write, which
@@ -56,5 +61,7 @@ def ensure_none_text_source() -> None:
         SOURCE_NONE_TEXT,
     )
     LOG.info(
-        "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT
+        "ensure_none_text_source: ready (source=%s topic=%s)",
+        SOURCE_NONE_TEXT,
+        TOPIC_NONE_TEXT,
     )
diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index ac6fada801506..b1262762b2fb7 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -58,6 +58,12 @@
 _RETRY_MAX_S = 2.0
 
 
+def _truncate_sql(sql: str, max_len: int = 120) -> str:
+    """Single-line truncation for logging."""
+    flat = " ".join(sql.split())
+    return flat if len(flat) <= max_len else flat[: max_len - 3] + "..."
+
+
 def _retryable(exc: BaseException) -> bool:
     if isinstance(exc, psycopg.OperationalError):
         return True
@@ -70,9 +76,20 @@ def _retryable(exc: BaseException) -> bool:
 @contextmanager
 def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]:
     """Yield a connection, retrying transient failures up to RETRY_BUDGET_S."""
-    deadline = time.monotonic() + _RETRY_BUDGET_S
+    start = time.monotonic()
+    deadline = start + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
+    attempt = 0
+    LOG.debug(
+        "pg connect: starting (host=%s port=%d timeout=%ds budget=%ds)",
+        PGHOST,
+        PGPORT,
+        CONNECT_TIMEOUT_S,
+        _RETRY_BUDGET_S,
+    )
     while True:
+        attempt += 1
+        attempt_start = time.monotonic()
         try:
             conn = psycopg.connect(
                 host=PGHOST,
@@ -82,11 +99,35 @@ def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]:
                 connect_timeout=CONNECT_TIMEOUT_S,
                 autocommit=autocommit,
             )
+            LOG.info(
+                "pg connect: established on attempt %d in %.2fs (total %.2fs)",
+                attempt,
+                time.monotonic() - attempt_start,
+                time.monotonic() - start,
+            )
             break
         except Exception as exc:  # noqa: BLE001
+            elapsed_attempt = time.monotonic() - attempt_start
+            elapsed_total = time.monotonic() - start
             if not _retryable(exc) or time.monotonic() > deadline:
+                LOG.warning(
+                    "pg connect: giving up after attempt %d (%.2fs attempt, %.2fs total): %s",
+                    attempt,
+                    elapsed_attempt,
+                    elapsed_total,
+                    exc,
+                )
                 raise
-            LOG.info("pg connect retrying after %s; backoff=%.2fs", exc, backoff)
+            LOG.info(
+                "pg connect: attempt %d failed in %.2fs (%.2fs of %ds budget used): %s; "
+                "sleeping %.2fs",
+                attempt,
+                elapsed_attempt,
+                elapsed_total,
+                _RETRY_BUDGET_S,
+                exc,
+                backoff,
+            )
             time.sleep(backoff)
             backoff = min(backoff * 2, _RETRY_MAX_S)
     try:
@@ -100,17 +141,42 @@ def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]:
 
 def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None:
     """Execute a statement, retrying transient errors. No result returned."""
-    deadline = time.monotonic() + _RETRY_BUDGET_S
+    sql_summary = _truncate_sql(sql)
+    LOG.debug("pg execute: %s", sql_summary)
+    start = time.monotonic()
+    deadline = start + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
+    attempt = 0
     while True:
+        attempt += 1
         try:
             with connect() as conn, conn.cursor() as cur:
                 cur.execute(sql, params or ())
+            LOG.debug(
+                "pg execute: ok on attempt %d in %.2fs (%s)",
+                attempt,
+                time.monotonic() - start,
+                sql_summary,
+            )
             return
         except Exception as exc:  # noqa: BLE001
             if not _retryable(exc) or time.monotonic() > deadline:
+                LOG.warning(
+                    "pg execute: giving up after %d attempts (%.2fs total) on %s: %s",
+                    attempt,
+                    time.monotonic() - start,
+                    sql_summary,
+                    exc,
+                )
                 raise
-            LOG.info("pg execute retrying after %s", exc)
+            LOG.info(
+                "pg execute: attempt %d failed (%.2fs of %ds used) on %s: %s",
+                attempt,
+                time.monotonic() - start,
+                _RETRY_BUDGET_S,
+                sql_summary,
+                exc,
+            )
             time.sleep(backoff)
             backoff = min(backoff * 2, _RETRY_MAX_S)
 
@@ -134,19 +200,46 @@ def query_retry(
     the rows live at an mz_ts further forward — assigned by the reclock's
     next-probe binding).
     """
-    deadline = time.monotonic() + _RETRY_BUDGET_S
+    sql_summary = _truncate_sql(sql)
+    LOG.debug("pg query: %s (rtr=%s)", sql_summary, real_time_recency)
+    start = time.monotonic()
+    deadline = start + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
+    attempt = 0
     while True:
+        attempt += 1
         try:
             with connect() as conn, conn.cursor() as cur:
                 if real_time_recency:
                     cur.execute("SET real_time_recency = TRUE")
                 cur.execute(sql, params or ())
-                return list(cur.fetchall())
+                rows = list(cur.fetchall())
+            LOG.debug(
+                "pg query: ok on attempt %d in %.2fs, %d rows (%s)",
+                attempt,
+                time.monotonic() - start,
+                len(rows),
+                sql_summary,
+            )
+            return rows
         except Exception as exc:  # noqa: BLE001
             if not _retryable(exc) or time.monotonic() > deadline:
+                LOG.warning(
+                    "pg query: giving up after %d attempts (%.2fs total) on %s: %s",
+                    attempt,
+                    time.monotonic() - start,
+                    sql_summary,
+                    exc,
+                )
                 raise
-            LOG.info("pg query retrying after %s", exc)
+            LOG.info(
+                "pg query: attempt %d failed (%.2fs of %ds used) on %s: %s",
+                attempt,
+                time.monotonic() - start,
+                _RETRY_BUDGET_S,
+                sql_summary,
+                exc,
+            )
             time.sleep(backoff)
             backoff = min(backoff * 2, _RETRY_MAX_S)
 
@@ -166,9 +259,14 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non
     Used for ALTER SYSTEM SET and other operations the regular `materialize`
     role cannot perform. Retries the same transient errors as `execute_retry`.
     """
-    deadline = time.monotonic() + _RETRY_BUDGET_S
+    sql_summary = _truncate_sql(sql)
+    LOG.debug("pg internal execute: %s", sql_summary)
+    start = time.monotonic()
+    deadline = start + _RETRY_BUDGET_S
     backoff = _RETRY_INITIAL_S
+    attempt = 0
     while True:
+        attempt += 1
         try:
             with (
                 psycopg.connect(
@@ -182,11 +280,31 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non
                 conn.cursor() as cur,
             ):
                 cur.execute(sql, params or ())
+            LOG.debug(
+                "pg internal execute: ok on attempt %d in %.2fs (%s)",
+                attempt,
+                time.monotonic() - start,
+                sql_summary,
+            )
             return
         except Exception as exc:  # noqa: BLE001
             if not _retryable(exc) or time.monotonic() > deadline:
+                LOG.warning(
+                    "pg internal execute: giving up after %d attempts (%.2fs total) on %s: %s",
+                    attempt,
+                    time.monotonic() - start,
+                    sql_summary,
+                    exc,
+                )
                 raise
-            LOG.info("pg internal execute retrying after %s", exc)
+            LOG.info(
+                "pg internal execute: attempt %d failed (%.2fs of %ds used) on %s: %s",
+                attempt,
+                time.monotonic() - start,
+                _RETRY_BUDGET_S,
+                sql_summary,
+                exc,
+            )
             time.sleep(backoff)
             backoff = min(backoff * 2, _RETRY_MAX_S)
 
diff --git a/test/antithesis/workload/test/helper_source_stats.py b/test/antithesis/workload/test/helper_source_stats.py
index 54af7f0e29866..3216f06c5d900 100644
--- a/test/antithesis/workload/test/helper_source_stats.py
+++ b/test/antithesis/workload/test/helper_source_stats.py
@@ -56,30 +56,41 @@ def wait_for_catchup(
 
     Returns True if catchup completed within `timeout_s`, False on timeout.
     """
-    deadline = time.monotonic() + timeout_s
+    LOG.info(
+        "wait_for_catchup: starting (source=%s target=%d timeout=%.1fs)",
+        source_name,
+        target_offset,
+        timeout_s,
+    )
+    start = time.monotonic()
+    deadline = start + timeout_s
     last_seen: int | None = None
     while time.monotonic() < deadline:
         observed = offset_committed(source_name)
         if observed is not None and observed >= target_offset:
             LOG.info(
-                "source %s caught up: observed=%d target=%d",
+                "wait_for_catchup: source %s caught up in %.2fs (observed=%d target=%d)",
                 source_name,
+                time.monotonic() - start,
                 observed,
                 target_offset,
             )
             return True
         if observed != last_seen:
             LOG.info(
-                "source %s waiting for catchup: observed=%s target=%d",
+                "wait_for_catchup: source %s progress (observed=%s target=%d, %.1fs of %.1fs)",
                 source_name,
                 observed,
                 target_offset,
+                time.monotonic() - start,
+                timeout_s,
             )
             last_seen = observed
         time.sleep(poll_interval_s)
     LOG.warning(
-        "source %s catchup timeout: observed=%s target=%d",
+        "wait_for_catchup: source %s timed out after %.2fs (observed=%s target=%d)",
         source_name,
+        time.monotonic() - start,
         last_seen,
         target_offset,
     )
diff --git a/test/antithesis/workload/test/helper_table_mv.py b/test/antithesis/workload/test/helper_table_mv.py
index e865f3f2f5e89..ec890253593e8 100644
--- a/test/antithesis/workload/test/helper_table_mv.py
+++ b/test/antithesis/workload/test/helper_table_mv.py
@@ -50,6 +50,12 @@ def ensure_table_and_mv() -> None:
     cluster so dataflow execution is colocated with the rest of the
     workload's compute.
     """
+    LOG.info(
+        "ensure_table_and_mv: starting (table=%s mv=%s cluster=%s)",
+        TABLE_MV_INPUT,
+        MV_NAME,
+        CLUSTER,
+    )
     execute_retry(
         f"CREATE TABLE IF NOT EXISTS {TABLE_MV_INPUT} "
         f"(id BIGINT NOT NULL, prefix TEXT NOT NULL)"
@@ -61,4 +67,4 @@ def ensure_table_and_mv() -> None:
         f"FROM {TABLE_MV_INPUT} "
         f"GROUP BY prefix"
     )
-    LOG.info("table %s and MV %s ready", TABLE_MV_INPUT, MV_NAME)
+    LOG.info("ensure_table_and_mv: ready (table=%s mv=%s)", TABLE_MV_INPUT, MV_NAME)
diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py
index 6fac93cdd4f24..2457f522a148b 100644
--- a/test/antithesis/workload/test/helper_upsert_source.py
+++ b/test/antithesis/workload/test/helper_upsert_source.py
@@ -35,10 +35,16 @@
 
 
 def ensure_kafka_connection() -> None:
+    LOG.info(
+        "ensure_kafka_connection: starting (name=%s broker=%s)",
+        CONNECTION_NAME,
+        KAFKA_BROKER,
+    )
     execute_retry(
         f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} "
         f"TO KAFKA (BROKER '{KAFKA_BROKER}', SECURITY PROTOCOL = 'PLAINTEXT')"
     )
+    LOG.info("ensure_kafka_connection: done (name=%s)", CONNECTION_NAME)
 
 
 def ensure_upsert_text_source() -> None:
@@ -46,6 +52,11 @@ def ensure_upsert_text_source() -> None:
 
     The resulting source has columns `key TEXT NOT NULL` and `text TEXT`.
     """
+    LOG.info(
+        "ensure_upsert_text_source: starting (source=%s topic=%s)",
+        SOURCE_UPSERT_TEXT,
+        TOPIC_UPSERT_TEXT,
+    )
     ensure_kafka_connection()
     ensure_topic(TOPIC_UPSERT_TEXT)
     create_source_idempotent(
@@ -56,4 +67,8 @@ def ensure_upsert_text_source() -> None:
         f"ENVELOPE UPSERT",
         SOURCE_UPSERT_TEXT,
     )
-    LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT)
+    LOG.info(
+        "ensure_upsert_text_source: ready (source=%s topic=%s)",
+        SOURCE_UPSERT_TEXT,
+        TOPIC_UPSERT_TEXT,
+    )
diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
index cfda086d9acc7..2b7a488fdb7e1 100755
--- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
+++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py
@@ -43,9 +43,9 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 
+import helper_logging
 import helper_random
 from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_none_source import (
@@ -58,10 +58,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.kafka_none_envelope")
+LOG = helper_logging.setup_logging("driver.kafka_none_envelope")
 
 # Knobs. Tuned so each invocation is a small, self-contained unit of work
 # — Antithesis launches the driver many times and accumulates coverage
diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
index 635efe79fac69..389749dc55a81 100755
--- a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
+++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
@@ -42,20 +42,17 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
+import helper_logging
 import helper_random
 from helper_pg import execute_retry, query_one_retry
 from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.mv_reflects_table_updates")
+LOG = helper_logging.setup_logging("driver.mv_reflects_table_updates")
 
 INSERTS_PER_INVOCATION = 40
 # Sized to span at least one MAX_OFF window from the global fault-
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
index 4537985fe64ca..b70b6f230a7c8 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py
@@ -33,10 +33,10 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
+import helper_logging
 import helper_mysql
 import helper_random
 from helper_mysql_source import SOURCE_NAME, TABLE_NAME
@@ -44,10 +44,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.mysql_cdc")
+LOG = helper_logging.setup_logging("driver.mysql_cdc")
 
 ROWS_PER_INVOCATION = 20
 # Sized to span at least one MAX_OFF window from the global fault-
diff --git a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
index a7075a276c365..8e0e28043a371 100644
--- a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
+++ b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py
@@ -47,10 +47,10 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
+import helper_logging
 import helper_mysql
 import helper_random
 from helper_mysql_source import (
@@ -63,10 +63,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.mysql_myisam")
+LOG = helper_logging.setup_logging("driver.mysql_myisam")
 
 ROWS_PER_INVOCATION = 20
 # Sized to span at least one MAX_OFF window from the global fault-
diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
index 6a73f39b4be62..b8f5f536248e2 100644
--- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py
+++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py
@@ -38,13 +38,13 @@
 
 from __future__ import annotations
 
-import logging
 import os
 import sys
 import threading
 import time
 from typing import Any
 
+import helper_logging
 import helper_random
 import psycopg
 from helper_pg import (
@@ -94,10 +94,7 @@
 _pw_executor.logging = None
 _pw_executor.lock = threading.Lock()
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.parallel_workload")
+LOG = helper_logging.setup_logging("driver.parallel_workload")
 
 # Antithesis Test Composer invokes drivers in tight loops, so this script is
 # intentionally short. The cap exists so a single iteration can't monopolise
diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
index 71cb339149018..33430bd3c1ebe 100755
--- a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
+++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py
@@ -52,11 +52,11 @@
 
 from __future__ import annotations
 
-import logging
 import os
 import sys
 import time
 
+import helper_logging
 import helper_random
 import psycopg
 from helper_pg import (
@@ -70,10 +70,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.strict_serializable_reads")
+LOG = helper_logging.setup_logging("driver.strict_serializable_reads")
 
 STEPS_PER_INVOCATION = 12
 # Sized to span at least one MAX_OFF window from the global fault-
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
index 8ea69f67c04b1..23e5b706bb65b 100644
--- a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py
@@ -55,9 +55,9 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 
+import helper_logging
 import helper_random
 from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_retry
@@ -70,10 +70,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.upsert_ancient_key_writable")
+LOG = helper_logging.setup_logging("driver.upsert_ancient_key_writable")
 
 # Fixed key ring owned exclusively by this driver. No other driver writes
 # keys matching this prefix, so the property's assertions are race-free
diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
index 125e71b7c114f..af3af4222e8ad 100755
--- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
+++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py
@@ -34,9 +34,9 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 
+import helper_logging
 import helper_random
 from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_one_retry
@@ -49,10 +49,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.upsert_latest_value")
+LOG = helper_logging.setup_logging("driver.upsert_latest_value")
 
 # Knobs. Kept small per-invocation because Antithesis launches the driver many
 # times; total coverage comes from re-invocations, not from one huge run.
diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
index fd9c7cf389001..defba0cfb5203 100755
--- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
+++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py
@@ -46,11 +46,11 @@
 
 from __future__ import annotations
 
-import logging
 import os
 import sys
 import time
 
+import helper_logging
 import helper_random
 import psycopg
 from helper_pg import (
@@ -64,10 +64,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.catalog_recovery_consistency")
+LOG = helper_logging.setup_logging("driver.catalog_recovery_consistency")
 
 # Long-running knobs: the driver owns its timeline and the per-cycle budget
 # has to comfortably exceed environmentd's restart time so a fault landing
diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
index 58e1de5c18ac4..f12cefc4f45d1 100755
--- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
+++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py
@@ -55,10 +55,10 @@
 
 from __future__ import annotations
 
-import logging
 import sys
 import time
 
+import helper_logging
 import helper_random
 from helper_kafka import FLUSH_TIMEOUT_S, make_producer
 from helper_pg import query_one_retry
@@ -71,10 +71,7 @@
 
 from antithesis.assertions import always, sometimes
 
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
-)
-LOG = logging.getLogger("driver.upsert_state_rehydration")
+LOG = helper_logging.setup_logging("driver.upsert_state_rehydration")
 
 # Long-running knobs — this driver owns its timeline alongside parallel
 # drivers, so the per-cycle budget is generous and the cycle count high

From d3962c5cc79805e8eaf971b30de8edad6461f24d Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 20:43:34 -0400
Subject: [PATCH 61/65] test/antithesis: helper_pg: retry server-side
 InternalError from broker validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A driver invocation failed after one attempt with:

  WARNING pg execute: giving up after 1 attempts (17.88s total) on
  CREATE SOURCE IF NOT EXISTS upsert_text_src IN CLUSTER antithesis_cluster
  FROM KAFKA CONNECTION antithesis_kafka_conn ...:
  Meta data fetch error: BrokerTransportFailure (Local: Broker transport failure)

Materialize validates `CREATE SOURCE ... FROM KAFKA` by doing a broker
metadata fetch as part of the planning path. Under the global fault-
orchestrator's faults-ON windows, that fetch fails — and Materialize
surfaces the failure as a plain `psycopg.errors.InternalError`, *not*
`OperationalError`. Our `_retryable` predicate only recognised
OperationalError + InterfaceError, so the driver burned through one
attempt in 17s and aborted instead of waiting for the next quiet window.

Widen `_retryable` to also accept `InternalError` whose message matches
one of a small set of patterns identifying transient external
dependencies:

  * librdkafka surfaces:  BrokerTransportFailure, Meta data fetch error,
                          "Local: All broker connections are down",
                          "Local: Timed out"
  * schema-registry HTTP: "schema registry", Connection refused/reset
  * DNS partitioned:      Failed to resolve hostname, no route to host,
                          Temporary failure in name resolution
  * postgres/mysql source upstream:
                          could not translate host name,
                          could not connect to server

Patterns are intentionally narrow so a real catalog/schema error
("relation does not exist", "syntax error at or near...") still
propagates after one attempt rather than spinning silently.

Applies to execute_retry / query_retry / execute_internal_retry via the
shared predicate; `create_source_idempotent` keeps its separate
already-exists-tolerance path on top.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/workload/test/helper_pg.py | 39 ++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py
index b1262762b2fb7..da12098217823 100644
--- a/test/antithesis/workload/test/helper_pg.py
+++ b/test/antithesis/workload/test/helper_pg.py
@@ -64,12 +64,51 @@ def _truncate_sql(sql: str, max_len: int = 120) -> str:
     return flat if len(flat) <= max_len else flat[: max_len - 3] + "..."
 
 
+# Substring matches for server-side `InternalError` whose root cause is a
+# transient external dependency rather than a workload bug. Materialize
+# surfaces broker/upstream validation failures during DDL (notably
+# `CREATE SOURCE ... FROM KAFKA`, which does a broker metadata fetch as part
+# of validation) as plain `InternalError`, *not* `OperationalError`, so
+# the default psycopg classification treats them as non-retryable. Under
+# the global fault-orchestrator a broker pause is expected to clear in
+# the next quiet window; we should keep trying.
+#
+# Keep the patterns specific enough that we don't accidentally swallow
+# real schema errors. Anything not matched here still propagates after
+# one attempt.
+_TRANSIENT_INTERNAL_PATTERNS = (
+    # librdkafka error surfaces (CREATE SOURCE / CREATE CONNECTION validation)
+    "BrokerTransportFailure",
+    "Broker transport failure",
+    "Meta data fetch error",
+    "Local: All broker connections are down",
+    "Local: Timed out",
+    # schema-registry HTTP failures during CREATE SOURCE FORMAT AVRO ...
+    "schema registry",
+    "Connection refused",
+    "Connection reset",
+    # DNS partitioned against an upstream hostname
+    "Failed to resolve hostname",
+    "Temporary failure in name resolution",
+    "no route to host",
+    # postgres / mysql source validation reach-the-upstream failures
+    "could not translate host name",
+    "could not connect to server",
+)
+
+
 def _retryable(exc: BaseException) -> bool:
     if isinstance(exc, psycopg.OperationalError):
         return True
     # psycopg wraps server-side admin shutdowns as InterfaceError on next op.
     if isinstance(exc, psycopg.InterfaceError):
         return True
+    # Server-side InternalError caused by a transient external dependency
+    # (broker metadata fetch, schema registry down, etc.). Patterns match
+    # case-insensitively: the OS says "No route to host", capital N.
+    if isinstance(exc, psycopg.errors.InternalError):
+        msg = str(exc).lower()
+        return any(pat.lower() in msg for pat in _TRANSIENT_INTERNAL_PATTERNS)
     return False
 
 

From 714e252a507de2331f42c5918ca68ca135a88142 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Thu, 14 May 2026 21:15:06 -0400
Subject: [PATCH 62/65] test/antithesis: revert clusterd workers back to 4
 (bisection)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Almost no workloads have been finishing on recent builds. Dov noted
that Antithesis's deterministic hypervisor runs the whole fleet on a
single core, which makes the workers-per-clusterd bump from 4 to 16
(commit 86d1fbbc058b, "bump clusterd workers to 16 and shrink pool
to 2") look suspicious:

  * Total Timely workers across the four clusterd containers went from
    40 to 64.
  * Per-process worker count went from 4 to 16 — sixteen work-stealing
    threads sharing one core would burn most wakeups on context-switch
    overhead and starve dependent steps.

Revert just `CLUSTERD_WORKERS = 16 -> 4` (keeping the pool size at 2;
pool size by itself shouldn't cause this). Regenerate the compose
yaml; every CREATE CLUSTER REPLICAS WORKERS clause now matches the
clusterd `workers=` argument again.

If a build at this commit shows workloads finishing again, the workers
bump is the regression and we stay at 4. If they still don't
finish, the regression is elsewhere (rpqlkrmq's container_name +
hostname + named-bridge-network is the next candidate).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/antithesis/config/docker-compose.yaml | 18 ++++++------
 test/antithesis/mzcompose.py               | 32 ++++++++++------------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index cf9a35d147c97..58120c7ad9160 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -283,10 +283,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -325,10 +325,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -367,10 +367,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -409,10 +409,10 @@ services:
     - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879
     - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0
     - CLUSTERD_PROCESS=0
-    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2102"],
+    - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2102"],
       "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
-    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2103"],
+    - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2103"],
       "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc":
       false, "zero_copy_limit": null}'
     volumes:
@@ -710,7 +710,7 @@ services:
     - MZ_ANTITHESIS_CLUSTER=antithesis_cluster
     - ANTITHESIS_CLUSTERD_POOL_SIZE=2
     - CLUSTERD_POOL_SIZE=2
-    - CLUSTERD_WORKERS=16
+    - CLUSTERD_WORKERS=4
     - MYSQL_HOST=mysql
     - MYSQL_REPLICA_HOST=mysql-replica
     - MYSQL_PASSWORD=p@ssw0rd
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 9fef5db40faed..3e63ef8096e19 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -67,12 +67,12 @@
 # topology closer to production replica counts.
 CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "2"))
 
-# Timely worker threads per clusterd process. Bumped to 16 to match the
-# per-process worker density of larger production cluster sizes — single-
-# process clusterds at workers=16 cover the same intra-process
-# concurrency surface as a 4-process scale=4,workers=4 production
-# deployment, so we exercise per-shard parallelism, scheduler contention,
-# and the Antithesis thread-pause fault target with realistic depth.
+# Timely worker threads per clusterd process. Reverted from 16 back to 4
+# on suspicion that Antithesis's deterministic hypervisor runs the whole
+# fleet on a single core — 16 work-stealing Timely workers per process
+# on one core would burn most of their wakeups on context-switch
+# overhead and starve dependent steps, which would manifest as
+# workloads never finishing.
 #
 # This value must stay in lockstep with the `WORKERS N` clause in every
 # CREATE CLUSTER REPLICAS statement that targets these containers
@@ -80,7 +80,7 @@
 # the Workload service passes through; the parallel-workload Python
 # driver consumes the same env via the framework's pool-cluster
 # wrapper).
-CLUSTERD_WORKERS = 16
+CLUSTERD_WORKERS = 4
 
 
 class FaultOrchestrator(Service):
@@ -244,16 +244,14 @@ def __init__(self) -> None:
     # Antithesis kill either replica's backing container without taking
     # the workload offline.
     #
-    # `workers=CLUSTERD_WORKERS` (16) per clusterd means each replica runs
-    # that many timely worker threads in one process. Sized to cover the
-    # per-process worker density of larger production cluster sizes:
-    # single-process clusterds at workers=16 exercise the same
-    # intra-process concurrency surface as a 4-process scale=4,workers=4
-    # production deployment (per-shard parallelism, scheduler contention,
-    # Antithesis thread-pause fault targets). The matching `WORKERS N`
-    # clause in every CREATE CLUSTER REPLICAS statement must equal this
-    # — workload-entrypoint.sh reads CLUSTERD_WORKERS from the env the
-    # Workload service exports.
+    # `workers=CLUSTERD_WORKERS` (4) per clusterd means each replica runs
+    # four timely worker threads in one process. Was bumped to 16 to
+    # match production single-process density but reverted on suspicion
+    # that Antithesis's single-core hypervisor turns 16-thread work-
+    # stealing into a context-switch storm that starves progress. The
+    # matching `WORKERS N` clause in every CREATE CLUSTER REPLICAS
+    # statement must equal this — workload-entrypoint.sh reads
+    # CLUSTERD_WORKERS from the env the Workload service exports.
     #
     # `scratch_directory=None` matches production: cluster replicas in
     # cloud deployments don't get a scratch disk, so the upsert operator's

From 3c50003b7dc6ca539e638b22004e1a3a9ec0aaf2 Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Fri, 15 May 2026 16:21:18 -0400
Subject: [PATCH 63/65] test/antithesis: add Postgres CDC driver +
 testdrive-runner singleton
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a Postgres CDC pipeline (postgres-source upstream + Materialize
source pg_cdc_source on antithesis_pg_cdc.cdc_test) and the
parallel_driver_pg_cdc data-loss workload that uses it. Also adds a
singleton testdrive-runner that picks a random test/pg-cdc/*.td file
on each invocation and runs it through the bundled testdrive binary —
the Antithesis-side anchor for the open ci-flakes on PG CDC
(database-issues#9931, #9571, and the rest of the suite).

Pieces:

- mzcompose.py: adds postgres-source (logical wal_level, small
  max_slot_wal_keep_size so the corresponding td file's slot-
  invalidation assertion can fire); plumbs PG_SOURCE_* + the JSON
  CLUSTER_REPLICA_SIZES map into the workload service env.
- export-compose.py: postgres-source gets a network alias 'postgres'
  so the repo's td files (which hardcode @postgres) resolve unchanged;
  inline_postgres_setup is now scoped to setup_materialize=True PG
  services only.
- workload/Dockerfile: multi-stage MZFROM testdrive copies the binary +
  libduckdb.so + libfdb_c.so in. Base bumped to python:3.12-slim-trixie
  for testdrive's GLIBC 2.38+ requirement. Bundles test/pg-cdc/*.td via
  a pre-image: copy step.
- workload/test/helper_pg_upstream.py, helper_pg_source.py,
  first_pg_cdc_setup.py, parallel_driver_pg_cdc.py: the PG-CDC data-loss
  workload, mirroring the MySQL family. Lives in its own
  antithesis_pg_cdc schema so it coexists with the testdrive-runner
  (which owns public).
- workload/test/helper_testdrive.py: runs a checked-in td file through
  testdrive after stripping its skip-if-true header. reset_*_state
  helpers wipe everything that isn't the data-loss workload's state
  (sources, clusters, tables, connections, secrets, schemas on the
  materialize side; publications, replication slots, schemas, roles on
  the upstream side).
- workload/test/singleton_driver_pg_cdc_testdrive.py: enumerates
  /opt/testdrive-files/test/pg-cdc/*.td, picks one at random per
  invocation, runs it under the property
  pg-cdc-testdrive-suite-no-spurious-failure. Six files are excluded;
  reasons documented inline (SSL, trust-auth divergence, RI
  validation tightening, #9571 test/product drift, pg_hba customization
  needed).

47 of 53 td files are in rotation. Sweep at one-pick-per-file
confirmed all 47 pass cleanly without fault injection.
---
 .../test/antithesis/workload/Dockerfile       |  68 ---
 .../test/antithesis/workload/mzbuild.yml      |  39 --
 test/antithesis/config/.env                   |  24 +-
 test/antithesis/config/docker-compose.yaml    | 166 ++++++
 test/antithesis/export-compose.py             |  41 +-
 test/antithesis/mzcompose.py                  |  52 +-
 .../scratchbook/property-catalog.md           |  40 ++
 test/antithesis/workload/.gitignore           |   1 +
 test/antithesis/workload/Dockerfile           |  34 +-
 test/antithesis/workload/mzbuild.yml          |  11 +
 .../workload/test/first_pg_cdc_setup.py       | 109 ++++
 .../workload/test/helper_pg_source.py         | 128 +++++
 .../workload/test/helper_pg_upstream.py       | 167 ++++++
 .../workload/test/helper_testdrive.py         | 534 ++++++++++++++++++
 .../workload/test/parallel_driver_pg_cdc.py   | 234 ++++++++
 .../test/singleton_driver_pg_cdc_testdrive.py | 207 +++++++
 16 files changed, 1719 insertions(+), 136 deletions(-)
 delete mode 100644 misc/python/test/antithesis/workload/Dockerfile
 delete mode 100644 misc/python/test/antithesis/workload/mzbuild.yml
 create mode 100644 test/antithesis/workload/test/first_pg_cdc_setup.py
 create mode 100644 test/antithesis/workload/test/helper_pg_source.py
 create mode 100644 test/antithesis/workload/test/helper_pg_upstream.py
 create mode 100644 test/antithesis/workload/test/helper_testdrive.py
 create mode 100644 test/antithesis/workload/test/parallel_driver_pg_cdc.py
 create mode 100644 test/antithesis/workload/test/singleton_driver_pg_cdc_testdrive.py

diff --git a/misc/python/test/antithesis/workload/Dockerfile b/misc/python/test/antithesis/workload/Dockerfile
deleted file mode 100644
index 513a8d75b5a04..0000000000000
--- a/misc/python/test/antithesis/workload/Dockerfile
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright Materialize, Inc. and contributors. All rights reserved.
-#
-# Use of this software is governed by the Business Source License
-# included in the LICENSE file at the root of this repository.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0.
-
-# Antithesis workload client for Materialize.
-#
-# Python-based test driver that connects to materialized via pgwire,
-# produces Kafka messages, and emits Antithesis assertions. The
-# parallel-workload driver reuses the real `materialize.parallel_workload`
-# Python package — see mzbuild.yml for the pre-image copy of the slice it
-# needs, and stubs/materialize/mzcompose/ for the docker-compose surface we
-# have to mock out.
-
-FROM python:3.12-slim-bookworm
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl \
-    postgresql-client \
-    && rm -rf /var/lib/apt/lists/*
-
-# `confluent-kafka[avro]` pulls fastavro, which `data_ingest.executor`
-# imports at module top via `confluent_kafka.schema_registry.avro`.
-# `pg8000`, `websocket-client`, `requests`, `xxhash`, `zstandard` cover the
-# rest of the module-load-time imports walking from `parallel_workload` →
-# `data_ingest` → `materialize.util`.
-RUN pip install --no-cache-dir \
-    psycopg[binary]==3.2.9 \
-    "confluent-kafka[avro]==2.8.0" \
-    antithesis==0.2.0 \
-    PyMySQL==1.1.1 \
-    pg8000==1.31.2 \
-    websocket-client==1.8.0 \
-    requests==2.32.3 \
-    xxhash==3.5.0 \
-    zstandard==0.23.0
-
-# setup-complete script
-COPY setup-complete.sh /usr/local/bin/setup-complete.sh
-RUN chmod +x /usr/local/bin/setup-complete.sh
-
-# Test template directory — populated by antithesis-workload skill later
-RUN mkdir -p /opt/antithesis/test/v1/materialize
-
-# Catalog directory for Python assertion cataloging
-RUN mkdir -p /opt/antithesis/catalog
-
-# Ship the `materialize.*` Python package needed by the parallel-workload
-# driver. Stubs are copied first so that the real parallel-workload code
-# layered on top can satisfy its top-level `from materialize.mzcompose...`
-# imports against tiny placeholders. `MZ_ROOT` is required by
-# `materialize/__init__.py` at import time — point it at the package root.
-COPY stubs/materialize/ /opt/antithesis-pkg/materialize/
-COPY materialize/ /opt/antithesis-pkg/materialize/
-ENV PYTHONPATH=/opt/antithesis-pkg
-ENV MZ_ROOT=/opt/antithesis-pkg
-
-# Copy test templates and entrypoint
-COPY test/ /opt/antithesis/test/v1/materialize/
-COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh
-RUN chmod +x /usr/local/bin/workload-entrypoint.sh
-RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true
-
-ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"]
diff --git a/misc/python/test/antithesis/workload/mzbuild.yml b/misc/python/test/antithesis/workload/mzbuild.yml
deleted file mode 100644
index 2d69faddfd065..0000000000000
--- a/misc/python/test/antithesis/workload/mzbuild.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright Materialize, Inc. and contributors. All rights reserved.
-#
-# Use of this software is governed by the Business Source License
-# included in the LICENSE file at the root of this repository.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0.
-
-name: antithesis-workload
-
-# The parallel-workload driver reuses the real `materialize.parallel_workload`
-# Python package rather than reimplementing it. Copy the slice of
-# `misc/python/materialize/` it needs into the build context so the Dockerfile
-# can bundle it into the image. Everything in `materialize.mzcompose.*` is
-# replaced by tiny stubs (see `stubs/materialize/mzcompose/`) — Antithesis
-# injects faults at the container layer, so the workload container has no
-# docker-compose orchestration to call into.
-pre-image:
-  - type: copy
-    source: misc/python
-    destination: .
-    matching: materialize/__init__.py
-  - type: copy
-    source: misc/python
-    destination: .
-    matching: materialize/util.py
-  - type: copy
-    source: misc/python
-    destination: .
-    matching: materialize/sqlsmith.py
-  - type: copy
-    source: misc/python
-    destination: .
-    matching: materialize/parallel_workload
-  - type: copy
-    source: misc/python
-    destination: .
-    matching: materialize/data_ingest
diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env
index d4f160a98596f..341a92886750c 100644
--- a/test/antithesis/config/.env
+++ b/test/antithesis/config/.env
@@ -1,21 +1,5 @@
-# Copyright Materialize, Inc. and contributors. All rights reserved.
-#
-# Use of this software is governed by the Business Source License
-# included in the LICENSE file at the root of this repository.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0.
-
-# Compose env-file for `test/antithesis/config/docker-compose.yaml`.
-# Tracked by git only so that the file exists for mzbuild's input
-# fingerprinting and survives `git clean -ffdX` between builds. The
-# committed values are placeholders — `build-antithesis.sh` overwrites
-# them in CI with refs to images pushed to Antithesis's GCP Artifact
-# Registry, and `make export-env` does the same with local-dev refs.
-#
-# If you see these placeholder values on a running cluster, your build
-# pipeline did not regenerate this file. Run:
+# GENERATED FILE — do not edit. Regenerate via:
 #   bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env
-MATERIALIZED_IMAGE=placeholder-not-built
-ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built
+# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.
+MATERIALIZED_IMAGE=ghcr.io/materializeinc/materialize/materialized:mzbuild-KCOH6PR3STRLZAH6O3VVGMJYMH3H3ZTJ
+ANTITHESIS_WORKLOAD_IMAGE=ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-SSVB2N7XZF62H4MIKAY3G3JCPTPDZ3AX
diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml
index 58120c7ad9160..d070d283b67ec 100644
--- a/test/antithesis/config/docker-compose.yaml
+++ b/test/antithesis/config/docker-compose.yaml
@@ -260,6 +260,42 @@ services:
     hostname: mysql-replica
     networks:
     - antithesis-net
+  postgres-source:
+    command:
+    - postgres
+    - -c
+    - wal_level=logical
+    - -c
+    - max_wal_senders=100
+    - -c
+    - max_replication_slots=100
+    - -c
+    - max_connections=5000
+    - -c
+    - max_slot_wal_keep_size=64MB
+    ports:
+    - '5432'
+    environment:
+    - POSTGRESDB=postgres
+    - POSTGRES_PASSWORD=postgres
+    - POSTGRES_HOST_AUTH_METHOD=trust
+    healthcheck:
+      test:
+      - CMD
+      - pg_isready
+      - -U
+      - postgres
+      interval: 1s
+      start_period: 30s
+    restart: 'no'
+    platform: linux/amd64
+    image: postgres:17.7
+    container_name: postgres-source
+    hostname: postgres-source
+    networks:
+      antithesis-net:
+        aliases:
+        - postgres
   clusterd1:
     entrypoint:
     - tini
@@ -298,6 +334,7 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    pull_policy: never
     container_name: clusterd1
     hostname: clusterd1
     networks:
@@ -340,6 +377,7 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    pull_policy: never
     container_name: clusterd2
     hostname: clusterd2
     networks:
@@ -382,6 +420,7 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    pull_policy: never
     container_name: clusterd-pool-0
     hostname: clusterd-pool-0
     networks:
@@ -424,6 +463,7 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    pull_policy: never
     container_name: clusterd-pool-1
     hostname: clusterd-pool-1
     networks:
@@ -616,6 +656,7 @@ services:
     stop_grace_period: 120s
     platform: linux/amd64
     image: ${MATERIALIZED_IMAGE}
+    pull_policy: never
     container_name: materialized
     networks:
     - antithesis-net
@@ -699,6 +740,8 @@ services:
         condition: service_healthy
       mysql-replica:
         condition: service_healthy
+      postgres-source:
+        condition: service_healthy
     environment:
     - PGHOST=materialized
     - PGPORT=6875
@@ -714,8 +757,131 @@ services:
     - MYSQL_HOST=mysql
     - MYSQL_REPLICA_HOST=mysql-replica
     - MYSQL_PASSWORD=p@ssw0rd
+    - PG_SOURCE_HOST=postgres-source
+    - PG_SOURCE_PORT=5432
+    - PG_SOURCE_USER=postgres
+    - PG_SOURCE_PASSWORD=postgres
+    - PG_SOURCE_DATABASE=postgres
+    - 'CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit": null,
+      "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": true,
+      "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4": {"cpu_exclusive":
+      false, "cpu_limit": null, "credits_per_hour": "8", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "4 GiB", "scale": 2, "workers": 4}, "scale=1,workers=1,legacy":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": false, "memory_limit": "4 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=2,legacy": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      false, "memory_limit": "4 GiB", "scale": 1, "workers": 2}, "free": {"cpu_exclusive":
+      false, "cpu_limit": null, "credits_per_hour": "1", "disabled": true, "disk_limit":
+      null, "is_cc": true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=1,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=1,mem=16GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "16 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 1}, "scale=1,workers=1,mem=1GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "1 GiB", "scale": 1, "workers": 1}, "scale=1,workers=2":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 2}, "scale=1,workers=2,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 2}, "scale=1,workers=2,mem=16GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "16 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 2}, "scale=2,workers=1": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 2, "workers": 1}, "scale=2,workers=2":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      2, "workers": 2}, "scale=1,workers=2,mem=2GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "2 GiB", "scale": 1, "workers": 2}, "scale=1,workers=4":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 4}, "scale=1,workers=4,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 4}, "scale=1,workers=4,mem=16GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "16 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 4}, "scale=4,workers=1": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 4, "workers": 1}, "scale=4,workers=4":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      4, "workers": 4}, "scale=1,workers=8": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 8}, "scale=1,workers=8,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 8}, "scale=1,workers=8,mem=32GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "32 GiB", "scale": 1, "workers": 8}, "scale=8,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      8, "workers": 1}, "scale=8,workers=8": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "64", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 8, "workers": 8}, "scale=1,workers=16":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 16}, "scale=1,workers=16,mem=4GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=8GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale":
+      1, "workers": 16}, "scale=1,workers=16,mem=16GiB": {"cpu_exclusive": false,
+      "cpu_limit": null, "credits_per_hour": "16", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "16 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=32GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale":
+      1, "workers": 16}, "scale=16,workers=1": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 16, "workers": 1}, "scale=16,workers=16":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "256", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      16, "workers": 16}, "scale=1,workers=32": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=4GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      1, "workers": 32}, "scale=1,workers=32,mem=8GiB": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "8 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=16GiB":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale":
+      1, "workers": 32}, "scale=1,workers=32,mem=32GiB": {"cpu_exclusive": false,
+      "cpu_limit": null, "credits_per_hour": "32", "disabled": false, "disk_limit":
+      null, "is_cc": true, "memory_limit": "32 GiB", "scale": 1, "workers": 32}, "scale=32,workers=1":
+      {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled":
+      false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale":
+      32, "workers": 1}, "scale=32,workers=32": {"cpu_exclusive": false, "cpu_limit":
+      null, "credits_per_hour": "1024", "disabled": false, "disk_limit": null, "is_cc":
+      true, "memory_limit": "4 GiB", "scale": 32, "workers": 32}}'
     platform: linux/amd64
     image: ${ANTITHESIS_WORKLOAD_IMAGE}
+    pull_policy: never
     container_name: workload
     hostname: workload
     networks:
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index b1921f10474be..3d8a471641df6 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -110,18 +110,30 @@ def inline_postgres_setup(svc: dict[str, Any]) -> None:
     Antithesis has no host filesystem, so we can't mount the SQL file.
     Read it from misc/postgres/setup_materialize.sql (one source of truth)
     and bake it into the service entrypoint.
+
+    The inline-setup transform only fires when the service originally
+    requested it (Postgres-ctor `setup_materialize=True`, which appears
+    here as a bind-mounted setup_materialize.sql). Plain postgres-image
+    services — e.g. a vanilla PG used as a CDC upstream — get the
+    common env-fixup (drop LD_PRELOAD, add HOST_AUTH_METHOD=trust) and
+    nothing else.
     """
     if not svc.get("image", "").startswith("postgres:"):
         return
 
+    vols = svc.get("volumes", []) or []
+    has_setup = any(isinstance(v, str) and "setup_materialize.sql" in v for v in vols)
+
     env = svc.setdefault("environment", [])
     # eatmydata isn't installed in the public postgres image.
     env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")]
     # Trust auth — Antithesis-internal traffic only.
     env.append("POSTGRES_HOST_AUTH_METHOD=trust")
 
+    if not has_setup:
+        return
+
     # Drop the bind-mounted setup SQL; we'll inline it.
-    vols = svc.get("volumes", [])
     vols[:] = [v for v in vols if "setup_materialize.sql" not in v]
     if not vols:
         svc.pop("volumes", None)
@@ -203,13 +215,30 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None:
 ANTITHESIS_NETWORK = "antithesis-net"
 
 
-def assign_network(svc: dict[str, Any]) -> None:
+# Extra docker-network aliases per service. The repository's
+# `test/pg-cdc/*.td` files hard-code `@postgres` as the upstream hostname;
+# the testdrive-runner drivers run those files unmodified by aliasing
+# `postgres` to our `postgres-source` container at the network-DNS layer.
+EXTRA_NETWORK_ALIASES: dict[str, list[str]] = {
+    "postgres-source": ["postgres"],
+}
+
+
+def assign_network(name: str, svc: dict[str, Any]) -> None:
     """Place the service on the single named bridge network so docker-DNS
     is deterministic. Overwrites any pre-existing `networks` entry — some
     upstream Service classes set a vestigial `default: aliases: []` block
     that we don't want carried through.
+
+    Services that need additional names on the same network (see
+    `EXTRA_NETWORK_ALIASES`) use the long-form mapping syntax so we can
+    declare `aliases:` for them. Plain services keep the short list form.
     """
-    svc["networks"] = [ANTITHESIS_NETWORK]
+    aliases = EXTRA_NETWORK_ALIASES.get(name)
+    if aliases:
+        svc["networks"] = {ANTITHESIS_NETWORK: {"aliases": aliases}}
+    else:
+        svc["networks"] = [ANTITHESIS_NETWORK]
 
 
 def declare_top_level_network(compose: dict[str, Any]) -> None:
@@ -269,9 +298,7 @@ def upgrade_started_to_healthy(compose: dict[str, Any]) -> None:
     — there's nothing to wait on.
     """
     services = compose.get("services", {})
-    has_healthcheck = {
-        name for name, svc in services.items() if "healthcheck" in svc
-    }
+    has_healthcheck = {name for name, svc in services.items() if "healthcheck" in svc}
     for svc in services.values():
         deps = svc.get("depends_on")
         if not isinstance(deps, dict):
@@ -328,7 +355,7 @@ def main() -> None:
         strip_incompatible_env(svc)
         strip_mzcompose_keys(svc)
         set_explicit_names(name, svc)
-        assign_network(svc)
+        assign_network(name, svc)
 
     declare_top_level_network(c.compose)
     upgrade_started_to_healthy(c.compose)
diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py
index 3e63ef8096e19..8167aef466a89 100644
--- a/test/antithesis/mzcompose.py
+++ b/test/antithesis/mzcompose.py
@@ -45,9 +45,11 @@
   bin/pyactivate test/antithesis/export-compose.py > config/...     # dump compose YAML
 """
 
+import json
 import os
 from pathlib import Path
 
+from materialize.mzcompose import cluster_replica_size_map
 from materialize.mzcompose.composition import Composition
 from materialize.mzcompose.service import Service, ServiceConfig
 from materialize.mzcompose.services.clusterd import Clusterd
@@ -55,7 +57,7 @@
 from materialize.mzcompose.services.materialized import Materialized
 from materialize.mzcompose.services.minio import Minio
 from materialize.mzcompose.services.mysql import MySql, create_mysql_server_args
-from materialize.mzcompose.services.postgres import PostgresMetadata
+from materialize.mzcompose.services.postgres import Postgres, PostgresMetadata
 from materialize.mzcompose.services.schema_registry import SchemaRegistry
 from materialize.mzcompose.services.zookeeper import Zookeeper
 
@@ -167,6 +169,7 @@ def __init__(self) -> None:
                 "schema-registry": {"condition": "service_started"},
                 "mysql": {"condition": "service_healthy"},
                 "mysql-replica": {"condition": "service_healthy"},
+                "postgres-source": {"condition": "service_healthy"},
             },
             "environment": [
                 "PGHOST=materialized",
@@ -197,6 +200,26 @@ def __init__(self) -> None:
                 "MYSQL_HOST=mysql",
                 "MYSQL_REPLICA_HOST=mysql-replica",
                 f"MYSQL_PASSWORD={MySql.DEFAULT_ROOT_PASSWORD}",
+                # Postgres CDC upstream connection details. Materialize
+                # talks to this PG directly via a logical replication slot
+                # — production Postgres CDC is single-instance, unlike the
+                # MySQL primary+replica topology above.
+                "PG_SOURCE_HOST=postgres-source",
+                "PG_SOURCE_PORT=5432",
+                "PG_SOURCE_USER=postgres",
+                "PG_SOURCE_PASSWORD=postgres",
+                "PG_SOURCE_DATABASE=postgres",
+                # The testdrive binary inside the workload image reads
+                # this from the env (clap `env=CLUSTER_REPLICA_SIZES`)
+                # and uses it for any `CREATE CLUSTER REPLICAS (... SIZE
+                # '...')` statement in a checked-in `.td` file. Without
+                # it, testdrive aborts at startup with "required argument
+                # missing". The map matches what `materialized` is
+                # actually configured with, so the size names a test
+                # file references (`scale=1,workers=1`, `'1'`, ...) all
+                # resolve.
+                "CLUSTER_REPLICA_SIZES="
+                + json.dumps(cluster_replica_size_map()),
             ],
         }
         super().__init__(name="workload", config=config)
@@ -237,6 +260,32 @@ def __init__(self) -> None:
             "--replica_preserve_commit_order=ON",
         ],
     ),
+    # Postgres source — single instance with logical replication enabled.
+    # Materialize talks to this PG directly via a replication slot, which
+    # is how PG CDC is deployed in production (unlike MySQL where Mz reads
+    # from a replica). Separate from `postgres-metadata` so Antithesis
+    # faults on the source path don't interfere with consensus storage.
+    #
+    # `wal_level=logical`, `max_wal_senders`, `max_replication_slots` are
+    # set by Postgres' default ctor. Not passing `setup_materialize=True`
+    # keeps the init scripts out — the PG-CDC setup driver creates the
+    # schema, publication, and table at runtime.
+    #
+    # `max_slot_wal_keep_size=64MB` bounds how much WAL a stalled
+    # replication slot is allowed to retain. With the default `-1`
+    # (unlimited), `test/pg-cdc/max-slot-wal-keep-size.td` can't trigger
+    # the slot-invalidation error it's checking for — the bulk insert
+    # plus two full-table updates the test does just accumulate quietly,
+    # Materialize catches up cleanly on resume, and the `! SELECT ...
+    # contains:replication slot has been invalidated` assertion fails.
+    # 64MB is small enough that the test's ~6GB of dirty pages overruns
+    # it deterministically, large enough that no realistic non-test
+    # workload trips it.
+    Postgres(
+        name="postgres-source",
+        ports=["5432"],
+        extra_command=["-c", "max_slot_wal_keep_size=64MB"],
+    ),
     # Two clusterd processes, one per replica of the unmanaged
     # `antithesis_cluster`. Provisioning both replicas in the same cluster
     # exercises multi-replica source ingestion and compute paths
@@ -328,6 +377,7 @@ def workflow_default(c: Composition) -> None:
         *pool_services,
         "mysql",
         "mysql-replica",
+        "postgres-source",
     )
     c.up("materialized")
     c.up("fault-orchestrator")
diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 067a63f755e8c..39804f51a1edc 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -466,3 +466,43 @@ commit-order preservation) to the Antithesis environment.
 | **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. |
 | **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. Direct regression target for commit 3e32df1f69. |
 | **Why It Matters** | The statistics view is consumed by users and by operational tooling to compute lag. A regression in causality makes lag metrics meaningless and is the kind of bug that survives unit tests but fails under adversarial timing. |
+
+## Category 10: Postgres CDC Source
+
+Properties specific to Materialize's Postgres CDC source pipeline, which
+subscribes to an upstream Postgres instance via a logical replication slot.
+Production Postgres CDC is single-instance — there is no equivalent of the
+MySQL primary→replica intermediate hop — so the topology adds a single
+`postgres-source` PG container (logical wal_level, replication-slot
+enabled) to the Antithesis environment.
+
+Motivation: the open `ci-flake` issues that exercise the PG CDC path
+currently have no Antithesis-side property to anchor a reproduction
+against — notably database-issues#9571 (alter-source error mismatch),
+#9931 (dropped-slot-errors), and #10047 (SSH-tunnel + CRDB metadata
+race). This category gives those flakes a home.
+
+### pg-cdc-testdrive-suite-no-spurious-failure — The Repo's PG CDC Test Suite Runs Cleanly Under Antithesis Fault Injection
+
+| | |
+|---|---|
+| **Type** | Safety (no non-transient testdrive failure) + Liveness (suite occasionally runs clean end-to-end) |
+| **Priority** | P1 — every checked-in `test/pg-cdc/*.td` regression test becomes an Antithesis property automatically. Direct regression target for the cluster of open ci-flakes on PG CDC: database-issues#9931 (dropped-slot-errors, currently CI-disabled), #9571 (alter-source.td, currently CI-disabled), #10047, and the broader testdrive-on-PG-CDC family. |
+| **Status** | **Implemented (workload-side, testdrive-runner)** — `test/antithesis/workload/test/singleton_driver_pg_cdc_testdrive.py` enumerates `/opt/testdrive-files/test/pg-cdc/*.td` (bundled into the antithesis-workload image via `pre-image: copy` in mzbuild.yml + `MZFROM testdrive` for the binary), picks one at random per invocation via `helper_random.random_choice`, and runs it via `helper_testdrive.run`. Two transforms happen at runtime: (i) the `$ skip-if / SELECT true` disable header is stripped so CI-disabled tests actually execute, and (ii) Materialize / upstream PG state is reset before and after each run. Singleton — at most one instance executes concurrently. The td files own `public` schema and the `mz_source`/`pgpass`/`pg`/`storage` names exclusively; the data-loss workload lives in `antithesis_pg_cdc.cdc_test` and is preserved by the reset helpers. |
+| **Property** | For every `.td` file in the repository's `test/pg-cdc/` suite (modulo the SSL-fixture exclusion list, which needs a TLS-configured upstream we don't model), running it via testdrive under Antithesis fault injection either succeeds or fails with output matching a recognized transient pattern (connection refused, server (re)initializing, etc.). A non-transient failure means at least one `>`/`!` checkpoint inside the testdrive script disagreed with observed Materialize behavior — the property is violated and the schedule that produced it is what Antithesis surfaces to triage. |
+| **Invariant** | `Always`: `result.succeeded OR result.looks_transient`. The assertion message is constant; the `td_file`, exit code, and stdout/stderr tails travel in the details so triage breaks results down per-file. `Sometimes`: `result.succeeded` — at least once per run, a randomly-selected file runs cleanly end-to-end, proving the safety check is not vacuously satisfied by always-transient demotion. |
+| **Antithesis Angle** | Antithesis explores schedules around the destructive sequences the td files already encode: DROP REPLICATION SLOT mid-ingestion (dropped-slot-errors.td), DROP TABLE racing with snapshot (alter-source.td, alter-table-after-source-{1,2}.td), schema changes that propagate during pause windows (replica-identity.td), transactions split across upstream-side faults (transactions.td, transactions-multi-conn.td), and 50+ more. Layered on top are the orchestrator's fault windows (kill clusterd, pause materialized, partition the upstream PG), so each invocation samples a different intersection of "destructive testdrive checkpoint × adversarial schedule". |
+| **Why It Matters** | The repo's pg-cdc test suite encodes Materialize's documented behavior on the upstream-PG CDC source — it's the closest we have to a contract spec. Under CI alone, several files in it are disabled pending the fixes Antithesis is meant to drive. Under this property, every checked-in test becomes a continuously-exercised property; new tests added to `test/pg-cdc/` are picked up automatically on the next image rebuild without a driver-level edit. |
+
+### pg-source-no-data-loss — Every Row Written to Upstream Postgres Is Eventually Visible
+
+| | |
+|---|---|
+| **Type** | Liveness + Safety |
+| **Priority** | P1 — end-to-end correctness of the PG CDC pipeline; tests a distinct code path from Kafka and MySQL |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_pg_cdc.py` + `first_pg_cdc_setup.py`. Each `parallel_driver_` invocation inserts 20 rows into the upstream PG's `antithesis_pg_cdc.cdc_test`, polls `antithesis_pg_cdc` in Materialize until all rows appear (or a 120 s budget expires). The data-loss workload lives in its own schema (not `public`) so the testdrive-runner drivers (`pg-source-survives-slot-drop`, `pg-alter-source-no-spurious-success`) can own `public` and run concurrently without trampling. `always("pg: CDC source row has correct value after catchup", …)` and `always("pg: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("pg: CDC source caught up to all upstream inserts within catchup budget", …)` is the liveness anchor. The `first_pg_cdc_setup.py` seeds the upstream PG (schema + table with `REPLICA IDENTITY FULL`, `PUBLICATION antithesis_pub`) and creates the Materialize-side secret/connection/source/table. |
+| **Property** | After inserting a row into the upstream Postgres (via logical replication on a publication), the Materialize CDC source eventually contains that row with the correct value. |
+| **Invariant** | `Always`: after catchup, for every row inserted into `antithesis_pg_cdc.cdc_test`, `SELECT value FROM antithesis_pg_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the budget at least once per run. |
+| **Antithesis Angle** | Kills/pauses of the upstream PG container (Materialize must resume from the replication slot LSN without dropping rows); network partitions between materialized and the upstream (replication slot is server-side state; partition + recovery exercises the resume path); clusterd restarts during ingestion (PG CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka and MySQL); fault during DDL on the Materialize side (`ALTER SOURCE`-class races that produce wrong error in #9571). |
+| **Why It Matters** | PG CDC is a distinct ingestion code path from Kafka and MySQL, with a different fault model: the replication slot is durable server-side state, so a slot that drifts past the consumer's LSN cannot replay older WAL. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume, source stuck with `replication slot does not exist` — is not caught by the Kafka- or MySQL-source drivers. Direct regression target for the cluster of open PG-source flakes in database-issues. |
+
diff --git a/test/antithesis/workload/.gitignore b/test/antithesis/workload/.gitignore
index 2c028d08d5e96..4fe55a19d3a9e 100644
--- a/test/antithesis/workload/.gitignore
+++ b/test/antithesis/workload/.gitignore
@@ -1,3 +1,4 @@
 # Populated at image-build time by the `pre-image: type: copy` directives in
 # mzbuild.yml — committing them would diverge from the source tree.
 /materialize/
+/testdrive-files/
diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile
index 513a8d75b5a04..5797ea84b7c7e 100644
--- a/test/antithesis/workload/Dockerfile
+++ b/test/antithesis/workload/Dockerfile
@@ -16,7 +16,21 @@
 # needs, and stubs/materialize/mzcompose/ for the docker-compose surface we
 # have to mock out.
 
-FROM python:3.12-slim-bookworm
+# Multi-stage source for the testdrive binary used by the testdrive-runner
+# driver (singleton_driver_pg_cdc_testdrive). MZFROM is mzbuild's multi-
+# stage directive — it registers a build-order dependency on the
+# `testdrive` image and substitutes the resolved fingerprint at build
+# time. The final workload image carries the binary plus libfdb_c.so and
+# libduckdb.so (pulled in by testdrive's foundationdb and duckdb-sys
+# features), so the workload container can run testdrive without any
+# further service plumbing.
+MZFROM testdrive AS testdrive-src
+
+# Debian 13 (trixie) base ships GLIBC 2.41; testdrive's mzbuild image is
+# built against ubuntu-base (Ubuntu 24.04, GLIBC 2.39) and references
+# GLIBC 2.38 / 2.39 symbols. The previous `python:3.12-slim-bookworm`
+# (Debian 12, GLIBC 2.36) couldn't load the testdrive binary.
+FROM python:3.12-slim-trixie
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
@@ -59,6 +73,24 @@ COPY materialize/ /opt/antithesis-pkg/materialize/
 ENV PYTHONPATH=/opt/antithesis-pkg
 ENV MZ_ROOT=/opt/antithesis-pkg
 
+# Copy the testdrive binary plus the two shared libs it dynamically
+# links against (testdrive is built with the foundationdb feature which
+# pulls libfdb_c, and duckdb-sys which pulls libduckdb). `ldconfig`
+# after the copy rebuilds the dynamic-linker cache so the loader finds
+# them. Without libfdb_c.so testdrive errors at startup with
+# "cannot open shared object file"; without libduckdb.so it errors
+# inside the duckdb-backed `kafka-create-topic` builtins.
+COPY --from=testdrive-src /usr/local/bin/testdrive /usr/local/bin/testdrive
+COPY --from=testdrive-src /usr/local/lib/libduckdb.so /usr/local/lib/libduckdb.so
+COPY --from=testdrive-src /usr/lib/libfdb_c.so /usr/lib/libfdb_c.so
+RUN ldconfig
+
+# Repo-checked-in testdrive .td files for the testdrive-runner drivers,
+# brought into the build context by the `pre-image: copy` step in
+# mzbuild.yml. The drivers reference these by repo-relative path under
+# /opt/testdrive-files, e.g. /opt/testdrive-files/test/pg-cdc/dropped-slot-errors.td.
+COPY testdrive-files/ /opt/testdrive-files/
+
 # Copy test templates and entrypoint
 COPY test/ /opt/antithesis/test/v1/materialize/
 COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh
diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml
index b957b4f8a2046..69923d21a1a08 100644
--- a/test/antithesis/workload/mzbuild.yml
+++ b/test/antithesis/workload/mzbuild.yml
@@ -35,3 +35,14 @@ pre-image:
     source: misc/python
     destination: .
     matching: materialize/data_ingest/**
+  # Repository-checked-in testdrive scripts that the testdrive-runner
+  # drivers execute under Antithesis fault injection. The driver applies
+  # a tiny runtime preprocessor (strips the `$ skip-if SELECT true`
+  # header used to disable these in CI) but otherwise runs the .td files
+  # unchanged. Keeping the source-of-truth path means new tests in
+  # `test/pg-cdc/` are exercised against Antithesis on the next image
+  # rebuild; the singleton driver enumerates the copied `*.td` files.
+  - type: copy
+    source: .
+    destination: testdrive-files
+    matching: test/pg-cdc/*.td
diff --git a/test/antithesis/workload/test/first_pg_cdc_setup.py b/test/antithesis/workload/test/first_pg_cdc_setup.py
new file mode 100644
index 0000000000000..469de86c642e7
--- /dev/null
+++ b/test/antithesis/workload/test/first_pg_cdc_setup.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis first_ command: configure the upstream PG and create the
+Materialize Postgres CDC source.
+
+Runs once per Antithesis timeline before any parallel/singleton drivers.
+Steps:
+  1. Wait for the upstream PG to accept connections.
+  2. CREATE SCHEMA antithesis_pg_cdc and TABLE antithesis_pg_cdc.cdc_test,
+     set REPLICA IDENTITY FULL (so DELETEs surface every column, not just the PK).
+  3. CREATE PUBLICATION antithesis_pub FOR TABLE antithesis_pg_cdc.cdc_test.
+  4. Create the Materialize-side secret/connection/source/table via
+     helper_pg_source.ensure_pg_cdc_source.
+
+REPLICA IDENTITY FULL is what the parallel driver's `_check_rows` semantic
+relies on — without it, Materialize sees the new row image on inserts but
+only the PK on deletes, which is fine for the count assertion but limits
+what we can validate downstream.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import helper_logging
+import helper_pg_upstream
+from helper_pg_source import (
+    UPSTREAM_PUBLICATION,
+    UPSTREAM_SCHEMA,
+    UPSTREAM_TABLE,
+    ensure_pg_cdc_source,
+)
+
+from antithesis.assertions import reachable
+
+LOG = helper_logging.setup_logging("first.pg_cdc_setup")
+
+
+def setup_upstream() -> None:
+    """Prepare the upstream PG for the data-loss workload (idempotent).
+
+    Creates the workload schema and table, switches the table to
+    REPLICA IDENTITY FULL, and creates the publication unless the
+    catalog already lists it.
+    """
+    qualified = f"{UPSTREAM_SCHEMA}.{UPSTREAM_TABLE}"
+    LOG.info("creating %s.%s on upstream PG", UPSTREAM_SCHEMA, UPSTREAM_TABLE)
+    # A dedicated schema (not `public`) leaves `public` exclusively to
+    # the testdrive-runner drivers.
+    helper_pg_upstream.execute(f"CREATE SCHEMA IF NOT EXISTS {UPSTREAM_SCHEMA}")
+    create_table_sql = f"""
+        CREATE TABLE IF NOT EXISTS {qualified} (
+            id          TEXT PRIMARY KEY,
+            batch_id    TEXT NOT NULL,
+            value       TEXT NOT NULL,
+            updated_at  TIMESTAMPTZ NOT NULL DEFAULT now()
+        )
+        """
+    helper_pg_upstream.execute(create_table_sql)
+    # REPLICA IDENTITY FULL puts the whole old row into every UPDATE and
+    # DELETE record, so upstream and Materialize row images line up
+    # across the full mutation cycle rather than only on INSERT.
+    helper_pg_upstream.execute(f"ALTER TABLE {qualified} REPLICA IDENTITY FULL")
+
+    # Postgres has no CREATE PUBLICATION IF NOT EXISTS, so consult the
+    # catalog first to stay idempotent.
+    existing = helper_pg_upstream.query(
+        "SELECT 1 FROM pg_publication WHERE pubname = %s",
+        (UPSTREAM_PUBLICATION,),
+    )
+    if existing:
+        LOG.info(
+            "publication %s already present; skipping create", UPSTREAM_PUBLICATION
+        )
+        return
+    helper_pg_upstream.execute(
+        f"CREATE PUBLICATION {UPSTREAM_PUBLICATION} FOR TABLE {qualified}"
+    )
+    LOG.info("publication %s created", UPSTREAM_PUBLICATION)
+
+
+def main() -> int:
+    """Seed the upstream PG, create the Materialize source, and return 0."""
+    upstream_host = helper_pg_upstream.PG_HOST
+    LOG.info("waiting for upstream PG (%s)...", upstream_host)
+    helper_pg_upstream.wait_until_ready()
+    # Upstream objects must exist before Materialize references them.
+    setup_upstream()
+    ensure_pg_cdc_source()
+
+    details = {"upstream": upstream_host, "publication": UPSTREAM_PUBLICATION}
+    reachable(
+        "pg: first-run setup complete — upstream PG seeded, Materialize source created",
+        details,
+    )
+    LOG.info("Postgres CDC setup complete")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_pg_source.py b/test/antithesis/workload/test/helper_pg_source.py
new file mode 100644
index 0000000000000..c3c58e0f90cab
--- /dev/null
+++ b/test/antithesis/workload/test/helper_pg_source.py
@@ -0,0 +1,128 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis Postgres CDC source in Materialize.
+
+The pipeline:
+  postgres-source --logical replication slot--> Materialize
+
+Production Postgres CDC is single-instance — there's no equivalent of
+the MySQL replica intermediate. Materialize opens a replication slot on
+the upstream PG and consumes WAL directly. Faults on the upstream
+exercise the source recovery path; faults on materialized exercise the
+replication-slot-resume path.
+
+Objects created in Materialize:
+  - SECRET     antithesis_pg_password
+  - CONNECTION antithesis_pg_conn  -> postgres-source
+  - SOURCE     pg_cdc_source        (IN CLUSTER antithesis_cluster)
+  - TABLE      antithesis_pg_cdc    (REFERENCE antithesis_pg_cdc.cdc_test)
+
+Objects created on the upstream PG:
+  - SCHEMA      antithesis_pg_cdc
+  - TABLE       antithesis_pg_cdc.cdc_test (REPLICA IDENTITY FULL so
+                 deletes carry the old row even with no PK column added)
+  - PUBLICATION antithesis_pub FOR TABLE antithesis_pg_cdc.cdc_test
+
+Note: the data-loss workload deliberately owns its own schema rather
+than `public`. The repository's `test/pg-cdc/*.td` testdrive files
+assume exclusive ownership of `public` (they DROP/CREATE it), so
+running them under Antithesis alongside this driver requires that we
+stay out of their way.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+import psycopg
+from helper_pg import create_source_idempotent, execute_retry, query_retry
+
+LOG = logging.getLogger("antithesis.helper_pg_source")
+
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+PG_SOURCE_HOST = os.environ.get("PG_SOURCE_HOST", "postgres-source")
+PG_SOURCE_PORT = int(os.environ.get("PG_SOURCE_PORT", "5432"))
+PG_SOURCE_USER = os.environ.get("PG_SOURCE_USER", "postgres")
+PG_SOURCE_PASSWORD = os.environ.get("PG_SOURCE_PASSWORD", "postgres")
+PG_SOURCE_DATABASE = os.environ.get("PG_SOURCE_DATABASE", "postgres")
+
+# Upstream-side names. The schema deliberately is NOT `public` so the
+# testdrive-runner drivers (which assume exclusive ownership of `public`)
+# can run concurrently with the data-loss workload without trampling.
+UPSTREAM_SCHEMA = "antithesis_pg_cdc"
+UPSTREAM_TABLE = "cdc_test"
+UPSTREAM_PUBLICATION = "antithesis_pub"
+
+# Materialize-side names.
+SECRET_NAME = "antithesis_pg_password"
+CONNECTION_NAME = "antithesis_pg_conn"
+SOURCE_NAME = "pg_cdc_source"
+TABLE_NAME = "antithesis_pg_cdc"
+
+
+def ensure_pg_connection() -> None:
+    """Create the upstream-PG secret and connection in Materialize (idempotent)."""
+    # Double embedded single quotes so env-supplied values stay valid SQL literals.
+    def lit(value: str) -> str:
+        return "'" + value.replace("'", "''") + "'"
+
+    secret_literal = lit(PG_SOURCE_PASSWORD)
+    execute_retry(f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS {secret_literal}")
+    execute_retry(
+        f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO POSTGRES ("
+        f"HOST {lit(PG_SOURCE_HOST)}, PORT {PG_SOURCE_PORT}, "
+        f"USER {lit(PG_SOURCE_USER)}, PASSWORD SECRET {SECRET_NAME}, "
+        f"DATABASE {lit(PG_SOURCE_DATABASE)})"
+    )
+    LOG.info("pg connection %s ready (upstream=%s)", CONNECTION_NAME, PG_SOURCE_HOST)
+
+
+def ensure_pg_cdc_table() -> None:
+    """Create the Materialize table reading from the CDC source (idempotent).
+
+    An "already exists" InternalError is tolerated only when mz_tables
+    confirms the table is present; any other failure propagates.
+    """
+    upstream = (UPSTREAM_SCHEMA, UPSTREAM_TABLE)
+    try:
+        execute_retry(
+            f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} "
+            f"FROM SOURCE {SOURCE_NAME} "
+            f"(REFERENCE {UPSTREAM_SCHEMA}.{UPSTREAM_TABLE})"
+        )
+    except psycopg.errors.InternalError as exc:
+        collided = "already exists" in str(exc)
+        if collided and query_retry(
+            "SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,)
+        ):
+            LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME)
+            return
+        raise
+    LOG.info("pg cdc table %s ready (upstream=%s.%s)", TABLE_NAME, *upstream)
+
+
+def ensure_pg_cdc_source() -> None:
+    """Create the full PG CDC pipeline in Materialize (idempotent).
+
+    Ensures the connection, then the source, then the table, in that
+    order. The upstream table and publication must already exist;
+    first_pg_cdc_setup.py creates them before calling this.
+    """
+    ensure_pg_connection()
+    create_sql = (
+        f"CREATE SOURCE IF NOT EXISTS {SOURCE_NAME} "
+        f"IN CLUSTER {CLUSTER} "
+        f"FROM POSTGRES CONNECTION {CONNECTION_NAME} "
+        f"(PUBLICATION '{UPSTREAM_PUBLICATION}')"
+    )
+    create_source_idempotent(create_sql, SOURCE_NAME)
+    LOG.info("pg cdc source %s ready", SOURCE_NAME)
+    ensure_pg_cdc_table()
diff --git a/test/antithesis/workload/test/helper_pg_upstream.py b/test/antithesis/workload/test/helper_pg_upstream.py
new file mode 100644
index 0000000000000..1e783c40a4da4
--- /dev/null
+++ b/test/antithesis/workload/test/helper_pg_upstream.py
@@ -0,0 +1,167 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Upstream-Postgres connection helpers for Antithesis drivers.
+
+Talks to the `postgres-source` container — the PG instance that
+Materialize subscribes to via a logical replication slot. Separate from
+`helper_pg`, which talks to materialized itself over pgwire.
+
+All calls retry transient network and operational errors up to a fixed
+budget so the workload keeps progressing through fault-injection windows.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+from collections.abc import Sequence
+from typing import Any
+
+import psycopg
+
+LOG = logging.getLogger("antithesis.helper_pg_upstream")
+
+PG_HOST = os.environ.get("PG_SOURCE_HOST", "postgres-source")
+PG_PORT = int(os.environ.get("PG_SOURCE_PORT", "5432"))
+PG_USER = os.environ.get("PG_SOURCE_USER", "postgres")
+PG_PASSWORD = os.environ.get("PG_SOURCE_PASSWORD", "postgres")
+PG_DATABASE = os.environ.get("PG_SOURCE_DATABASE", "postgres")
+
+# Mirrors helper_mysql / helper_pg budgets so per-attempt timeouts and
+# overall retry budget span at least one full faults-ON + faults-OFF
+# orchestrator cycle plus margin for the upstream to actually respond.
+_CONNECT_TIMEOUT_S = 30
+_RETRY_BUDGET_S = 180
+_RETRY_INITIAL_S = 0.5
+_RETRY_MAX_S = 4.0
+
+
+def _retryable(exc: BaseException) -> bool:
+    """Report whether `exc` is a psycopg error the retry loops may retry.
+
+    Only OperationalError and InterfaceError qualify.
+    """
+    return isinstance(exc, (psycopg.OperationalError, psycopg.InterfaceError))
+
+
+def _open() -> psycopg.Connection:
+    """Connect to the upstream PG, retrying retryable failures.
+
+    Backs off exponentially (capped at _RETRY_MAX_S); re-raises once the
+    error is not retryable or the _RETRY_BUDGET_S deadline has passed.
+    """
+    began = time.monotonic()
+    give_up_at = began + _RETRY_BUDGET_S
+    delay = _RETRY_INITIAL_S
+    tries = 0
+    target = (PG_HOST, PG_PORT, PG_DATABASE)
+    LOG.debug("pg upstream connect: starting (host=%s port=%d db=%s)", *target)
+    while True:
+        tries += 1
+        try_began = time.monotonic()
+        try:
+            conn = psycopg.connect(
+                host=PG_HOST,
+                port=PG_PORT,
+                user=PG_USER,
+                password=PG_PASSWORD,
+                dbname=PG_DATABASE,
+                connect_timeout=_CONNECT_TIMEOUT_S,
+                autocommit=True,
+            )
+            took = time.monotonic() - try_began
+            LOG.info(
+                "pg upstream connect: established on attempt %d in %.2fs", tries, took
+            )
+            return conn
+        except Exception as exc:  # noqa: BLE001
+            may_retry = _retryable(exc) and time.monotonic() <= give_up_at
+            if not may_retry:
+                LOG.warning(
+                    "pg upstream connect: giving up after %d attempts (%.2fs total): %s",
+                    tries,
+                    time.monotonic() - began,
+                    exc,
+                )
+                raise
+            LOG.info(
+                "pg upstream connect: attempt %d failed: %s; sleeping %.2fs",
+                tries,
+                exc,
+                delay,
+            )
+            time.sleep(delay)
+            delay = min(delay * 2, _RETRY_MAX_S)
+
+
+def execute(sql: str, params: Sequence[Any] | None = None) -> None:
+    """Run one statement on a fresh upstream connection, retrying transients."""
+    give_up_at = time.monotonic() + _RETRY_BUDGET_S
+    delay = _RETRY_INITIAL_S
+    while True:
+        try:
+            conn = _open()
+            try:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql, params or ())
+            finally:
+                conn.close()
+            return
+        except Exception as exc:  # noqa: BLE001
+            if time.monotonic() > give_up_at or not _retryable(exc):
+                raise
+            LOG.info("pg upstream execute retrying after %s", exc)
+            time.sleep(delay)
+            delay = min(delay * 2, _RETRY_MAX_S)
+
+
+def query(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]:
+    """Return all rows of a query run on a fresh connection, retrying transients."""
+    give_up_at = time.monotonic() + _RETRY_BUDGET_S
+    delay = _RETRY_INITIAL_S
+    while True:
+        try:
+            conn = _open()
+            try:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql, params or ())
+                    fetched = list(cursor.fetchall())
+            finally:
+                conn.close()
+            return fetched
+        except Exception as exc:  # noqa: BLE001
+            if time.monotonic() > give_up_at or not _retryable(exc):
+                raise
+            LOG.info("pg upstream query retrying after %s", exc)
+            time.sleep(delay)
+            delay = min(delay * 2, _RETRY_MAX_S)
+
+
+def wait_until_ready(timeout_s: float = 180.0) -> None:
+    """Poll until the upstream PG accepts a connection, else raise TimeoutError."""
+    give_up_at = time.monotonic() + timeout_s
+    while time.monotonic() < give_up_at:
+        try:
+            psycopg.connect(
+                host=PG_HOST,
+                port=PG_PORT,
+                user=PG_USER,
+                password=PG_PASSWORD,
+                dbname=PG_DATABASE,
+                connect_timeout=5,
+            ).close()
+            LOG.info("pg upstream %s is ready", PG_HOST)
+            return
+        except Exception as exc:  # noqa: BLE001
+            # Any failure counts as not-ready-yet: log it and poll again.
+            LOG.info("waiting for pg upstream %s: %s", PG_HOST, exc)
+            time.sleep(2)
+    raise TimeoutError(f"upstream Postgres at {PG_HOST} not ready after {timeout_s}s")
diff --git a/test/antithesis/workload/test/helper_testdrive.py b/test/antithesis/workload/test/helper_testdrive.py
new file mode 100644
index 0000000000000..b39b2f729d4e4
--- /dev/null
+++ b/test/antithesis/workload/test/helper_testdrive.py
@@ -0,0 +1,534 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Run repository-checked-in testdrive .td files under Antithesis.
+
+The testdrive binary is bundled into the antithesis-workload image via
+the multi-stage MZFROM in the Dockerfile. The repo's `test/pg-cdc/*.td`
+files are bundled in too (see `pre-image: copy` in mzbuild.yml) under
+/opt/testdrive-files/test/pg-cdc/.
+
+The drivers pass repo-relative file paths; this helper:
+  1. Reads the bundled file.
+  2. Strips the `$ skip-if` block — many checked-in `.td` files start
+     with `$ skip-if / SELECT true` to disable themselves pending an
+     upstream fix. Antithesis is the upstream-fix proving ground, so we
+     un-skip at runtime. The repo file stays untouched; only the
+     in-memory copy passed to testdrive is rewritten.
+  3. Writes the rewritten file to a tmp path.
+  4. Spawns testdrive with the right URLs for the antithesis topology.
+  5. Returns (exit_code, stdout, stderr) so the driver can decide
+     whether a failure is a real property violation or a transient
+     fault-injection artifact.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+import subprocess
+import tempfile
+from dataclasses import dataclass
+
+import helper_pg
+import helper_pg_upstream
+
+LOG = logging.getLogger("antithesis.helper_testdrive")
+
+# Names that the data-loss workload owns. Anything else in the user-
+# visible Materialize catalog is fair game for the pre-run reset — the
+# checked-in `test/pg-cdc/*.td` files assume a clean slate and create
+# objects like `mz_source`, `pgpass`, `pg`, `storage`, `t1` without
+# guarding for IF NOT EXISTS.
+#
+# The system clusters (`mz_introspection`, `mz_system`, `mz_catalog_server`,
+# `mz_probe`) are owned by `mz_system` and rejected by the regular DROP
+# CLUSTER, so we filter on `id LIKE 'u%'` rather than enumerating names.
+_PRESERVE_SOURCES = {"pg_cdc_source", "mysql_cdc_source"}
+_PRESERVE_CONNECTIONS = {"antithesis_pg_conn", "antithesis_mysql_conn"}
+_PRESERVE_SECRETS = {"antithesis_pg_password", "antithesis_mysql_password"}
+_PRESERVE_CLUSTERS = {"antithesis_cluster", "quickstart"}
+# Tables under the data-loss workload's source are subsources — DROP TABLE
+# on them is rejected unless we also drop the source. The reset filters
+# them out by name; they live inside `pg_cdc_source` / `mysql_cdc_source`
+# which is preserved.
+_PRESERVE_TABLES = {"antithesis_pg_cdc", "antithesis_cdc", "antithesis_cdc_myisam"}
+
+# Pool clusters share a `pool_cluster_*` prefix.
+_PRESERVE_CLUSTER_PREFIX = "pool_cluster_"
+
+# Where the workload Dockerfile lands the bundled .td files. The
+# pre-image: copy step in mzbuild.yml preserves the repo-relative path
+# under this prefix, so callers reference files exactly the way they
+# appear in the repo.
+TESTDRIVE_FILES_ROOT = "/opt/testdrive-files"
+TESTDRIVE_BINARY = "/usr/local/bin/testdrive"
+
+# Defaults matching the antithesis topology. Overridable via env so the
+# same helper works under `snouty validate` (local docker-compose) and
+# inside Antithesis (same image, same network).
+MATERIALIZE_URL = os.environ.get(
+    "MZ_MATERIALIZE_URL", "postgres://materialize@materialized:6875"
+)
+MATERIALIZE_INTERNAL_URL = os.environ.get(
+    "MZ_MATERIALIZE_INTERNAL_URL", "postgres://mz_system@materialized:6877"
+)
+KAFKA_ADDR = os.environ.get("KAFKA_BROKER", "kafka:9092")
+SCHEMA_REGISTRY_URL = os.environ.get(
+    "SCHEMA_REGISTRY_URL", "http://schema-registry:8081"
+)
+
+# Patterns we treat as transient fault-injection artifacts rather than
+# property violations. Testdrive surfaces these as non-zero exits when an
+# Antithesis fault window happens to overlap a sensitive moment in the
+# script. The driver layer demotes these to "didn't get a clean signal,
+# try again next time" rather than firing `always(False)`.
+TRANSIENT_PATTERNS = (
+    # Network / process-down windows.
+    "connection refused",
+    "connection reset",
+    "no route to host",
+    "broken pipe",
+    "could not connect to server",
+    "Failed to resolve hostname",
+    "Temporary failure in name resolution",
+    # Materialize admission control during a restart.
+    "is (re)initializing",
+    "TooManyRequests",
+    # Postgres source visiting its own restart window.
+    "terminating connection due to administrator command",
+)
+
+# `$ skip-if` directive removal regex. testdrive parses skip-if as:
+#   $ skip-if
+#   <SQL that returns >0 rows skips the rest of the file>
+# Many checked-in regression tests use `$ skip-if / SELECT true` as a
+# manual disable. We strip *only* that specific shape — any skip-if with
+# a non-trivial query is left intact (those gate on feature flags and
+# should still gate under Antithesis).
+_SKIP_IF_TRUE_RE = re.compile(
+    r"^\$ skip-if\s*\n\s*SELECT\s+true\s*\n",
+    re.MULTILINE | re.IGNORECASE,
+)
+
+
+@dataclass
+class TestdriveResult:
+    """Outcome of one testdrive invocation: exit code plus captured output."""
+
+    exit_code: int
+    stdout: str
+    stderr: str
+
+    @property
+    def succeeded(self) -> bool:
+        return self.exit_code == 0
+
+    @property
+    def looks_transient(self) -> bool:
+        """True if a failed run's output contains a TRANSIENT_PATTERNS entry.
+
+        Case-insensitive and deliberately generous: drivers use it to demote
+        likely fault-injection artifacts instead of reporting a violation.
+        """
+        if self.succeeded:
+            return False
+        haystack = "\n".join((self.stdout, self.stderr)).lower()
+        needles = [pattern.lower() for pattern in TRANSIENT_PATTERNS]
+        return any(needle in haystack for needle in needles)
+
+
+def _strip_skip_if_true(content: str) -> tuple[str, bool]:
+    """Delete every `$ skip-if` + `SELECT true` pair matched by _SKIP_IF_TRUE_RE.
+
+    Returns the rewritten text and whether at least one pair was removed;
+    skip-if blocks guarding on any other query are left untouched.
+    """
+    stripped_text, removed = _SKIP_IF_TRUE_RE.subn("", content)
+    was_stripped = removed > 0
+    return stripped_text, was_stripped
+
+
+def run(
+    td_file: str,
+    *,
+    timeout_s: float = 600.0,
+    extra_args: list[str] | None = None,
+) -> TestdriveResult:
+    """Run a bundled testdrive file and capture its outcome.
+
+    `td_file` is repo-relative, e.g. "test/pg-cdc/dropped-slot-errors.td".
+    A skip-if-true header is stripped from an in-memory copy, which is
+    staged in a throwaway directory; the bundled file is never modified.
+
+    Raises FileNotFoundError if the file is not bundled; a run exceeding
+    `timeout_s` is reported as exit code 124 rather than raised.
+    """
+    import shutil  # function-local: not among this module's imports
+
+    src_path = os.path.join(TESTDRIVE_FILES_ROOT, td_file)
+    if not os.path.isfile(src_path):
+        raise FileNotFoundError(
+            f"testdrive file {src_path!r} not bundled in workload image; "
+            f"check mzbuild.yml pre-image:copy 'matching:' glob"
+        )
+
+    # Explicit UTF-8 so reading does not depend on the container locale.
+    with open(src_path, encoding="utf-8") as f:
+        content = f.read()
+    rewritten, stripped = _strip_skip_if_true(content)
+    if stripped:
+        LOG.info("td %s: stripped `$ skip-if / SELECT true` header", td_file)
+
+    # testdrive treats its positional arg as a *glob pattern* and matches
+    # it against files found by `WalkDir::new(".").sort_by_file_name()` —
+    # i.e. it walks the process cwd, not the glob's directory. Passing an
+    # absolute path like `/tmp/foo.td` therefore matches nothing under
+    # `./` and exits with "glob did not match any patterns". We work
+    # around this by writing the rewritten file into a dedicated dir and
+    # running testdrive with that dir as cwd, passing just the basename.
+    tmp_name = "input.td"
+
+    cmd = [
+        TESTDRIVE_BINARY,
+        f"--materialize-url={MATERIALIZE_URL}",
+        f"--materialize-internal-url={MATERIALIZE_INTERNAL_URL}",
+        f"--kafka-addr={KAFKA_ADDR}",
+        f"--schema-registry-url={SCHEMA_REGISTRY_URL}",
+        "--no-reset",
+        # Per-statement default timeout; testdrive retries `>` and `!`
+        # statements internally until this expires. Long enough to span
+        # one full faults-ON+OFF orchestrator cycle (~80s default) plus
+        # margin for the statement itself.
+        "--default-timeout=180s",
+        # Vars referenced by many of the checked-in `test/pg-cdc/*.td`
+        # files (and other testdrive suites). Values mirror what
+        # `test/pg-cdc/mzcompose.py` passes when it runs the same files
+        # in CI — `scale=4,workers=4` for cluster replicas,
+        # `scale=4,workers=1` for storage. Without them, files that
+        # construct `> CREATE CLUSTER cdc_cluster SIZE
+        # '${arg.default-replica-size}'` fail at parse time with
+        # "unknown variable: arg.default-replica-size".
+        "--var=default-replica-size=scale=4,workers=4",
+        "--var=default-storage-size=scale=4,workers=1",
+        tmp_name,
+    ]
+    if extra_args:
+        cmd.extend(extra_args)
+
+    tmp_dir = tempfile.mkdtemp(prefix="td-run-", dir="/tmp")
+    try:
+        # Staged inside the try so a failed write still cleans up tmp_dir.
+        with open(os.path.join(tmp_dir, tmp_name), "w", encoding="utf-8") as f:
+            f.write(rewritten)
+        LOG.info("running testdrive (cwd=%s): %s", tmp_dir, " ".join(cmd))
+        proc = subprocess.run(
+            cmd,
+            cwd=tmp_dir,
+            capture_output=True,
+            text=True,
+            timeout=timeout_s,
+            check=False,
+        )
+        result = TestdriveResult(
+            exit_code=proc.returncode,
+            stdout=proc.stdout,
+            stderr=proc.stderr,
+        )
+    except subprocess.TimeoutExpired as exc:
+        LOG.warning("testdrive %s timed out after %.0fs", td_file, timeout_s)
+        # The partial output may be bytes even with text=True and may end
+        # mid-character, so decode leniently instead of risking a raise.
+        out, err = (
+            s.decode(errors="replace") if isinstance(s, bytes) else (s or "")
+            for s in (exc.stdout, exc.stderr)
+        )
+        result = TestdriveResult(
+            exit_code=124,
+            stdout=out,
+            stderr=err + f"\n[helper_testdrive] timed out after {timeout_s:.0f}s",
+        )
+    finally:
+        # Best-effort removal of the staged script and any testdrive scratch.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+    LOG.info(
+        "testdrive %s exited %d (transient=%s, stdout=%d bytes, stderr=%d bytes)",
+        td_file,
+        result.exit_code,
+        result.looks_transient,
+        len(result.stdout),
+        len(result.stderr),
+    )
+    return result
+
+
+def _quote_ident(name: str) -> str:
+    """Return `name` wrapped in double quotes, doubling embedded double quotes."""
+    return '"{}"'.format(name.replace('"', '""'))
+
+
+def _drop_user_objects(
+    kind: str,
+    plural: str,
+    list_sql: str,
+    preserved: set[str],
+    *,
+    preserved_prefix: str | None = None,
+    cascade: bool = True,
+) -> None:
+    """Drop each user object returned by `list_sql` unless it is preserved.
+
+    Shared loop behind every phase of reset_materialize_user_state.
+
+    `kind` is the lowercase object type ("source", "cluster", ...); it
+    is upper-cased into the DROP statement and, like `plural`, used in
+    the log lines. `list_sql` must return one column: the object name.
+    `preserved` holds exact names to skip; `preserved_prefix`, when
+    given, also skips every name starting with it. `cascade` appends
+    CASCADE to the DROP.
+
+    Best-effort: a failed enumeration or a failed individual DROP is
+    logged as a warning and never raised, so one stubborn object does
+    not stop the rest of the reset.
+    """
+    suffix = " CASCADE" if cascade else ""
+    try:
+        rows = helper_pg.query_retry(list_sql)
+        for (name,) in rows:
+            # Skip names the data-loss workload owns.
+            if name in preserved:
+                continue
+            if preserved_prefix is not None and name.startswith(preserved_prefix):
+                continue
+            LOG.info("reset: dropping %s %s", kind, name)
+            # One failed DROP must not abort the remaining ones.
+            try:
+                helper_pg.execute_retry(
+                    f"DROP {kind.upper()} IF EXISTS {_quote_ident(name)}{suffix}"
+                )
+            except Exception as exc:  # noqa: BLE001
+                LOG.warning("reset: drop %s %s failed: %s", kind, name, exc)
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset: enumerate %s failed: %s", plural, exc)
+
+
+def reset_materialize_user_state() -> None:
+    """Drop user-visible Materialize objects not owned by our workload.
+
+    Used as both pre- and post-cleanup around testdrive runs. The
+    checked-in `test/pg-cdc/*.td` files don't guard their `CREATE
+    SECRET/CONNECTION/SOURCE/CLUSTER/TABLE` statements with `IF NOT
+    EXISTS` and don't clean up after themselves, so consecutive runs of
+    different files (or two runs of the same file) collide on names
+    like `pgpass`, `mz_source`, `storage`, `t1`. This function leaves
+    only the data-loss workload's objects in place; anything else is
+    dropped (with CASCADE for everything except secrets).
+
+    Phases run in dependency order: sources (whose CASCADE also takes
+    their subsource tables), clusters, standalone tables, connections,
+    secrets, and finally schemas.
+
+    Best-effort: a missing object or permission denial logs a warning
+    but doesn't raise. The next td run's own CREATE will surface any
+    real residue.
+    """
+
+    # Drop sources first — they CASCADE to their subsource tables,
+    # which lets us avoid enumerating td-created tables individually.
+    # Filtering happens in Python because mixing the preserve-set into
+    # SQL would need parameter-binding for an IN-list, which psycopg
+    # doesn't expand transparently.
+    _drop_user_objects(
+        "source",
+        "sources",
+        "SELECT name FROM mz_sources WHERE id LIKE 'u%%'",
+        _PRESERVE_SOURCES,
+    )
+
+    # Drop user-owned clusters (`id LIKE 'u%'`) that aren't ours. The
+    # system clusters live under `s%` ids and would be rejected anyway.
+    _drop_user_objects(
+        "cluster",
+        "clusters",
+        "SELECT name FROM mz_clusters WHERE id LIKE 'u%%'",
+        _PRESERVE_CLUSTERS,
+        preserved_prefix=_PRESERVE_CLUSTER_PREFIX,
+    )
+
+    # Drop user tables that aren't subsources of preserved sources
+    # (subsources show up in mz_tables with non-null source_id; those
+    # were already dropped by the source CASCADE above).
+    _drop_user_objects(
+        "table",
+        "tables",
+        "SELECT name FROM mz_tables WHERE id LIKE 'u%%' AND source_id IS NULL",
+        _PRESERVE_TABLES,
+    )
+
+    # Connections after sources — sources reference them, so they have
+    # to be gone before we can drop a connection without CASCADE-killing
+    # a preserved source.
+    _drop_user_objects(
+        "connection",
+        "connections",
+        "SELECT name FROM mz_connections WHERE id LIKE 'u%%'",
+        _PRESERVE_CONNECTIONS,
+    )
+
+    # Secrets after connections, because connections reference them.
+    # This is the one phase that drops without CASCADE.
+    _drop_user_objects(
+        "secret",
+        "secrets",
+        "SELECT name FROM mz_secrets WHERE id LIKE 'u%%'",
+        _PRESERVE_SECRETS,
+        cascade=False,
+    )
+
+    # Drop user-created schemas in the materialize database. Td files like
+    # subsource-names.td do `> CREATE SCHEMA a;` against materialize and
+    # don't clean up. Filter by database name = 'materialize' so we don't
+    # touch other databases (system, etc.), and drop only user schemas
+    # (`id LIKE 'u%'`). The default schema `public` is preserved.
+    _drop_user_objects(
+        "schema",
+        "schemas",
+        "SELECT s.name FROM mz_schemas s "
+        "JOIN mz_databases d ON s.database_id = d.id "
+        "WHERE s.id LIKE 'u%%' AND d.name = 'materialize'",
+        {"public"},
+    )
+
+
+def reset_upstream_state() -> None:
+    """Reset upstream PG: publications, slots, user schemas, `public`, roles.
+
+    Most `test/pg-cdc/*.td` files start by dropping+recreating `public`
+    themselves, but doing it here too defends against a previous run
+    that crashed mid-script and left objects behind. The data-loss
+    workload lives in `antithesis_pg_cdc`, which we never touch.
+    """
+    # Drop non-data-loss publications. testdrive scripts use names
+    # like `mz_source`, `mz_source_extra`, etc; enumerating from the
+    # catalog is more reliable than a hardcoded list. The data-loss
+    # workload's `antithesis_pub` is preserved so `parallel_driver_pg_cdc`
+    # keeps working.
+    try:
+        rows = helper_pg_upstream.query("SELECT pubname FROM pg_publication")
+        for (pubname,) in rows:
+            if pubname == "antithesis_pub":
+                continue
+            try:
+                helper_pg_upstream.execute(
+                    f"DROP PUBLICATION IF EXISTS {_quote_ident(pubname)}"
+                )
+            except Exception as exc:  # noqa: BLE001
+                LOG.warning(
+                    "reset upstream: drop publication %s failed: %s",
+                    pubname,
+                    exc,
+                )
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset upstream: enumerate publications failed: %s", exc)
+
+    # Terminate + drop any leftover replication slots. After `DROP SOURCE`
+    # on the MZ side the slot release races with the next test's CREATE, and
+    # a still-active slot can't be dropped (`dropped-slot-errors.td` then
+    # fails with "slot is active for PID …"). Force-terminate, then drop.
+    # NOTE(review): the query is unfiltered, so a slot held by the preserved
+    # data-loss source would be dropped too — confirm that is intended.
+    try:
+        rows = helper_pg_upstream.query(
+            "SELECT slot_name, active_pid FROM pg_replication_slots"
+        )
+        for slot_name, active_pid in rows:
+            if active_pid is not None:
+                try:
+                    helper_pg_upstream.execute(
+                        "SELECT pg_terminate_backend(%s)", (active_pid,)
+                    )
+                except Exception as exc:  # noqa: BLE001
+                    LOG.warning(
+                        "reset upstream: terminate slot pid %s failed: %s",
+                        active_pid,
+                        exc,
+                    )
+            try:
+                helper_pg_upstream.execute(
+                    "SELECT pg_drop_replication_slot(%s)", (slot_name,)
+                )
+            except Exception as exc:  # noqa: BLE001
+                LOG.warning(
+                    "reset upstream: drop slot %s failed: %s", slot_name, exc
+                )
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset upstream: enumerate replication slots failed: %s", exc)
+
+    # Drop non-system user schemas. Schemas the td files commonly create
+    # in passing — `a`, `other`, `schema1`, `conflict_schema`, etc. —
+    # need to go, but the data-loss workload's `antithesis_pg_cdc` and
+    # postgres' system schemas must stay.
+    preserve_schemas = {
+        "public",
+        "antithesis_pg_cdc",
+        "pg_catalog",
+        "information_schema",
+        "pg_toast",
+        "pg_temp_1",
+        "pg_toast_temp_1",
+    }
+    try:
+        # psycopg treats `%` as a parameter placeholder; escape with `%%`
+        # for the literal LIKE-pattern wildcard.
+        rows = helper_pg_upstream.query(
+            "SELECT nspname FROM pg_namespace "
+            "WHERE nspname NOT LIKE 'pg_%%' AND nspname <> 'information_schema'"
+        )
+        for (nspname,) in rows:
+            if nspname in preserve_schemas:
+                continue
+            try:
+                helper_pg_upstream.execute(
+                    f"DROP SCHEMA IF EXISTS {_quote_ident(nspname)} CASCADE"
+                )
+            except Exception as exc:  # noqa: BLE001
+                LOG.warning(
+                    "reset upstream: drop schema %s failed: %s", nspname, exc
+                )
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset upstream: enumerate schemas failed: %s", exc)
+
+    # Re-create `public` last — many tests assume it exists. DROP CASCADE
+    # clears everything inside it.
+    try:
+        helper_pg_upstream.execute("DROP SCHEMA IF EXISTS public CASCADE")
+        helper_pg_upstream.execute("CREATE SCHEMA public")
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset upstream: drop/create schema public failed: %s", exc)
+
+    # Drop non-superuser, non-system roles. `privileges.td` creates a
+    # `priv` role and doesn't clean up; left in place it fails the next
+    # CREATE USER with "already exists". Postgres + replication role
+    # names (postgres, pg_*) are preserved.
+    try:
+        rows = helper_pg_upstream.query(
+            "SELECT rolname FROM pg_roles "
+            "WHERE rolname NOT LIKE 'pg_%%' AND rolname <> 'postgres'"
+        )
+        for (rolname,) in rows:
+            try:
+                helper_pg_upstream.execute(
+                    f"DROP ROLE IF EXISTS {_quote_ident(rolname)}"
+                )
+            except Exception as exc:  # noqa: BLE001
+                LOG.warning("reset upstream: drop role %s failed: %s", rolname, exc)
+    except Exception as exc:  # noqa: BLE001
+        LOG.warning("reset upstream: enumerate roles failed: %s", exc)
diff --git a/test/antithesis/workload/test/parallel_driver_pg_cdc.py b/test/antithesis/workload/test/parallel_driver_pg_cdc.py
new file mode 100644
index 0000000000000..c6c1f62c5fcef
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_pg_cdc.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `pg-source-no-data-loss`.
+
+Every row inserted into the upstream Postgres table named by helper_pg_source
+(`UPSTREAM_SCHEMA.UPSTREAM_TABLE`) must eventually appear — with the correct
+value — in the Materialize source that reads it via logical replication.
+
+Each invocation:
+  1. Checks the PG CDC source exists (created by first_pg_cdc_setup).
+  2. Picks a per-invocation `batch_id` prefix so concurrent drivers don't
+     collide.
+  3. Inserts ROWS_PER_INVOCATION rows on the upstream PG, recording the
+     expected {id → value} map locally.
+  4. Polls the Materialize source table until all expected rows appear
+     (or the catchup budget expires).
+  5. Asserts correctness via `always(...)` on count and per-row values.
+     A `sometimes(...)` liveness anchor fires on successful catchup.
+
+Motivation: closes the test-coverage gap behind several open
+database-issues flakes that exercise PG CDC but currently have no
+Antithesis-side property to anchor reproductions (#9571 alter-source,
+#9931 dropped-slot-errors, #10047 SSH+CRDB metadata race).
+
+Mirrors `parallel_driver_mysql_cdc.py` so a triage report can compare
+the two source families side-by-side.
+"""
+
+from __future__ import annotations
+
+import sys
+import time
+
+import helper_logging
+import helper_pg_upstream
+import helper_random
+from helper_pg import query_retry
+from helper_pg_source import (
+    SOURCE_NAME,
+    TABLE_NAME,
+    UPSTREAM_SCHEMA,
+    UPSTREAM_TABLE,
+)
+
+from antithesis.assertions import always, sometimes
+
+LOG = helper_logging.setup_logging("driver.pg_cdc")
+
+ROWS_PER_INVOCATION = 20
+# Sized to span at least one MAX_OFF window from the global fault-
+# orchestrator (default 40s) plus the time for upstream → source → MZ
+# catchup itself, which can stretch under intermittent network faults.
+CATCHUP_TIMEOUT_S = 120.0
+POLL_INTERVAL_S = 1.0
+
+
+def _source_exists() -> bool:
+    lookup_sql = "SELECT 1 FROM mz_sources WHERE name = %s"
+    return bool(query_retry(lookup_sql, (SOURCE_NAME,)))
+
+
+def _insert_rows(batch_id: str) -> dict[str, str]:
+    """Insert ROWS_PER_INVOCATION rows into the upstream PG.
+
+    Returns {id → value} for every row whose INSERT call did not raise.
+    """
+    expected: dict[str, str] = {}
+    for i in range(ROWS_PER_INVOCATION):
+        row_id = f"{batch_id}:{i}"
+        value = f"v{helper_random.random_int(0, 9999):04d}"
+        try:
+            helper_pg_upstream.execute(
+                f"INSERT INTO {UPSTREAM_SCHEMA}.{UPSTREAM_TABLE} "
+                f"(id, batch_id, value) VALUES (%s, %s, %s) "
+                f"ON CONFLICT (id) DO UPDATE "
+                f"SET batch_id = EXCLUDED.batch_id, value = EXCLUDED.value",
+                (row_id, batch_id, value),
+            )
+            expected[row_id] = value
+        except Exception as exc:  # noqa: BLE001
+            # Under fault injection a write to the upstream may fail. Skip
+            # the row (it is left out of `expected`) rather than crashing so
+            # the driver keeps inserting the others.
+            LOG.info("insert failed for row %s: %s; skipping", row_id, exc)
+    return expected
+
+
+def _wait_for_catchup(batch_id: str, expected_count: int) -> bool:
+    """Poll Materialize until all expected rows for `batch_id` appear.
+
+    Returns True once `COUNT(*) WHERE batch_id = ?` is >= expected_count,
+    False if CATCHUP_TIMEOUT_S elapses first. A failed poll is retried.
+    """
+    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
+    last_seen = -1
+    while time.monotonic() < deadline:
+        try:
+            rows = query_retry(
+                f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
+                (batch_id,),
+            )
+            count = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("catchup poll failed: %s; retrying", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        if count != last_seen:
+            LOG.info(
+                "pg cdc catchup: batch=%s observed=%d target=%d",
+                batch_id,
+                count,
+                expected_count,
+            )
+            last_seen = count
+
+        if count >= expected_count:
+            return True
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.warning(
+        "pg cdc catchup timeout: batch=%s last_seen=%d target=%d",
+        batch_id,
+        last_seen,
+        expected_count,
+    )
+    return False
+
+
+def _check_rows(expected: dict[str, str]) -> None:
+    """Assert, per expected row, that Materialize holds it with the right value."""
+    for row_id, want in expected.items():
+        # real_time_recency: the count-based catchup in _wait_for_catchup can
+        # clear at a chosen-ts that just barely satisfies the COUNT, leaving a
+        # per-row SELECT moments later to race. RTR pushes chosen-ts to the
+        # upstream PG's real-time frontier; see helper_pg.query_retry.
+        rows = query_retry(
+            f"SELECT value FROM {TABLE_NAME} WHERE id = %s",
+            (row_id,),
+            real_time_recency=True,
+        )
+        found = bool(rows)
+        observed = rows[0][0] if found else None
+        always(
+            found and observed == want,
+            "pg: CDC source row has correct value after catchup",
+            {
+                "source": TABLE_NAME,
+                "id": row_id,
+                "expected_value": want,
+                "observed_present": found,
+                "observed_value": observed,
+            },
+        )
+
+
+def main() -> int:
+    if not _source_exists():
+        # first_pg_cdc_setup must run before this driver. Outside Antithesis
+        # (e.g. snouty validate) the source may not exist yet — exit cleanly
+        # rather than erroring so validate can still proceed.
+        LOG.warning(
+            "pg cdc source %s not found; skipping "
+            "(first_pg_cdc_setup must run first)",
+            SOURCE_NAME,
+        )
+        return 0
+
+    batch_id = f"p{helper_random.random_u64():016x}"
+    LOG.info("driver starting; batch_id=%s", batch_id)
+
+    expected = _insert_rows(batch_id)
+    if not expected:
+        LOG.info("no rows inserted successfully this invocation; exiting cleanly")
+        return 0
+
+    LOG.info("inserted %d rows; waiting for catchup", len(expected))
+    caught_up = _wait_for_catchup(batch_id, len(expected))
+
+    # Liveness anchor: at least one invocation should fully catch up. If this
+    # never fires across an entire run the safety assertions below are vacuous.
+    sometimes(
+        caught_up,
+        "pg: CDC source caught up to all upstream inserts within catchup budget",
+        {
+            "source": TABLE_NAME,
+            "batch_id": batch_id,
+            "rows_inserted": len(expected),
+        },
+    )
+
+    if not caught_up:
+        # Don't run per-row safety assertions on stale data — a slow catchup
+        # is a separate concern from row-level correctness.
+        LOG.info("catchup did not complete in budget; skipping per-row assertions")
+        return 0
+
+    # Safety: every row we inserted must be present with the correct value.
+    _check_rows(expected)
+
+    # Count bounds: an INSERT that raised may still have committed upstream.
+    rows = query_retry(
+        f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
+        (batch_id,),
+        real_time_recency=True,
+    )
+    count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0
+    always(
+        len(expected) <= count_in_mz <= ROWS_PER_INVOCATION,
+        "pg: CDC source row count matches inserted count after catchup",
+        {
+            "source": TABLE_NAME,
+            "batch_id": batch_id,
+            "expected_count": len(expected),
+            "observed_count": count_in_mz,
+        },
+    )
+
+    LOG.info(
+        "driver done; asserted on %d rows for batch_id=%s", len(expected), batch_id
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/singleton_driver_pg_cdc_testdrive.py b/test/antithesis/workload/test/singleton_driver_pg_cdc_testdrive.py
new file mode 100644
index 0000000000000..c712116960ccd
--- /dev/null
+++ b/test/antithesis/workload/test/singleton_driver_pg_cdc_testdrive.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver: pick a random `test/pg-cdc/*.td` file and run it.
+
+Replaces the per-file singleton drivers. The Antithesis-side coverage
+of the PG CDC source code path grows automatically as new `.td` files
+land in `test/pg-cdc/` — no driver-level edit needed.
+
+Each invocation:
+  1. Resets Materialize user state (drops all user-visible objects not
+     owned by the data-loss workload) and the upstream PG's `public`
+     schema. Defends against state leftover from a previous run that
+     crashed mid-script or from a still-running `parallel_driver_pg_cdc`.
+  2. Picks one `.td` file at random from the bundled set, excluding
+     files known to be incompatible with our topology (SSL fixtures).
+  3. Runs the file via `helper_testdrive.run` — which strips the
+     `$ skip-if / SELECT true` disable header so the test actually
+     executes, then invokes the bundled testdrive binary.
+  4. Asserts on the result.
+  5. Cleans up Materialize state again and re-creates the data-loss
+     workload's source/connection/secret/table so `parallel_driver_pg_cdc`
+     finds them on its next invocation.
+
+This is a `singleton_driver_` because almost every `.td` file under
+`test/pg-cdc/` assumes exclusive ownership of `public` schema, the
+`mz_source` publication on the upstream, and the `pgpass`/`pg`/`mz_source`
+names on the materialize side. Two concurrent runs would trample each
+other; the singleton harness primitive enforces serial execution.
+
+Property name: `pg-cdc-testdrive-suite-no-spurious-failure`. The
+assertion message is constant; the `td_file` lives in the assertion
+details so Antithesis triage reports break the result down per-file.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+import helper_logging
+import helper_pg_source
+import helper_random
+import helper_testdrive
+
+from antithesis.assertions import always, sometimes
+
+LOG = helper_logging.setup_logging("driver.pg_cdc_testdrive")
+
+TD_DIR = os.path.join(helper_testdrive.TESTDRIVE_FILES_ROOT, "test/pg-cdc")
+
+# Files known to fail deterministically in our topology — filtered out so
+# the random picker doesn't fire `always(False)` on guaranteed-bad picks.
+# Each entry should explain *why* the test is incompatible; revisit when
+# the underlying assumption changes.
+#
+# Setup-incompatible (would need topology changes to support):
+#
+#   pg-cdc-ssl.td, pg-cdc-ssl-ca-bundle.td — require a TLS-configured
+#     upstream PG (custom certs). `postgres-source` is plain-TCP.
+#
+#   pg-cdc.td — exercises `! CREATE CONNECTION` with bad credentials and
+#     asserts the error message contains `password authentication failed
+#     for user "no_such_user"`. Our upstream PG runs with
+#     `POSTGRES_HOST_AUTH_METHOD=trust` (set by export-compose.py's
+#     inline_postgres_setup for the Antithesis sandbox network), so the
+#     auth path returns a different error class. Setup-specific —
+#     enabling MD5 auth would mean managing real passwords in the compose,
+#     which the sandbox doesn't need.
+#
+#   subsource-resolution-duplicates.td — needs a custom `pg_hba.conf`
+#     entry to test multi-user authentication paths. Our `postgres-source`
+#     uses the stock `pg_hba.conf` (trust for all internal traffic) so
+#     the test's auth-specific assertions don't apply.
+#
+# Stale relative to current product behavior (test pre-dates a tightening):
+#
+#   replica-identity-default-nothing.td — `> CREATE TABLE … FROM SOURCE`
+#     against a table with `REPLICA IDENTITY DEFAULT` is expected to
+#     succeed, but Materialize now eagerly rejects it at source-purify
+#     time with `referenced items not tables with REPLICA IDENTITY FULL`
+#     (src/sql/src/pure/error.rs). Test is skip-if-disabled in CI
+#     pending database-issues#4231; un-skipping under Antithesis still
+#     hits the same product-level rejection regardless of schedule.
+#
+#   alter-source.td — the 412-line database-issues#9571 flake suite.
+#     Asserts very specific error-message text for `! ALTER SOURCE` and
+#     `! CREATE TABLE FROM SOURCE` paths that has drifted with the
+#     product. Failures here are real test/product divergence, not
+#     race-sensitivity Antithesis can help with — fixes need test
+#     rewrites alongside the product changes that broke them.
+_EXCLUDE_FILES: frozenset[str] = frozenset(
+    {
+        "pg-cdc-ssl.td",
+        "pg-cdc-ssl-ca-bundle.td",
+        "pg-cdc.td",
+        "replica-identity-default-nothing.td",
+        "subsource-resolution-duplicates.td",
+        "alter-source.td",
+    }
+)
+
+
+def _list_td_files() -> list[str]:
+    """List the bundled pg-cdc testdrive scripts eligible for a random pick.
+
+    Returns sorted repo-relative paths; excluded or non-`.td` names are dropped.
+    """
+    if not os.path.isdir(TD_DIR):
+        LOG.warning("td dir %s missing; image may not be rebuilt", TD_DIR)
+        return []
+    return [
+        f"test/pg-cdc/{td_name}"
+        for td_name in sorted(os.listdir(TD_DIR))
+        if td_name.endswith(".td") and td_name not in _EXCLUDE_FILES
+    ]
+
+
+def main() -> int:
+    files = _list_td_files()
+    if not files:
+        LOG.warning("no pg-cdc td files bundled; exiting cleanly")
+        return 0
+
+    td_file = helper_random.random_choice(files)
+    LOG.info(
+        "picked %s from %d candidates (excluded %d)",
+        td_file,
+        len(files),
+        len(_EXCLUDE_FILES),
+    )
+
+    # Pre-run reset. Defends against residue from a prior crash. Also
+    # frees up `public` on the upstream and standard names like
+    # `pgpass`/`mz_source` on materialize so the td file's unprotected
+    # CREATEs land cleanly.
+    helper_testdrive.reset_materialize_user_state()
+    helper_testdrive.reset_upstream_state()
+
+    try:
+        result = helper_testdrive.run(td_file)
+    finally:
+        # Post-run reset regardless of outcome — even on failure we want
+        # the SUT in a known-clean state for the next driver.
+        helper_testdrive.reset_materialize_user_state()
+        helper_testdrive.reset_upstream_state()
+        # Restore the data-loss workload's PG CDC pipeline. The resets
+        # above are written to skip the workload's antithesis_* objects,
+        # but the td file itself could have disturbed them (none should
+        # — they all live in `public`). Re-running ensure_pg_cdc_source
+        # is expected to be idempotent; failures are only logged. This
+        # is defense in depth.
+        try:
+            helper_pg_source.ensure_pg_cdc_source()
+        except Exception as exc:  # noqa: BLE001
+            LOG.warning("post-run ensure_pg_cdc_source failed: %s", exc)
+
+    # Safety: under Antithesis fault injection, a testdrive run on any
+    # pg-cdc test file must either succeed or fail with a recognized
+    # transient marker. A non-transient failure means a `>` or `!`
+    # checkpoint inside the test disagreed with the SUT — i.e. a real
+    # property violation surfaced by the schedule Antithesis explored.
+    clean_or_transient = result.succeeded or result.looks_transient
+    always(
+        clean_or_transient,
+        "pg-cdc: testdrive script doesn't fail with non-transient error "
+        "under Antithesis fault injection",
+        {
+            "td_file": td_file,
+            "exit_code": result.exit_code,
+            "looks_transient": result.looks_transient,
+            "stdout_tail": result.stdout[-1500:],
+            "stderr_tail": result.stderr[-1500:],
+        },
+    )
+
+    # Liveness: at least sometimes, on at least some file, the suite
+    # runs cleanly. If this never fires the safety assertion is
+    # vacuously satisfied by transient demotion.
+    sometimes(
+        result.succeeded,
+        "pg-cdc: testdrive script runs cleanly under Antithesis",
+        {
+            "td_file": td_file,
+            "exit_code": result.exit_code,
+        },
+    )
+
+    LOG.info(
+        "pg-cdc testdrive %s: exit=%d transient=%s clean_or_transient=%s",
+        td_file,
+        result.exit_code,
+        result.looks_transient,
+        clean_or_transient,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 4cab4f148dd2ce22758492e00849e2bd54306f0c Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Fri, 15 May 2026 16:59:54 -0400
Subject: [PATCH 64/65] test/antithesis: local-dev (non-antithesis) build/up
 via make build-local + make up-local

The antithesis-flavored mzbuild path needs libvoidstar.so for the
Rust linker, which isn't published for aarch64 and isn't on developer
machines by default. Adds a parallel path that builds + runs the
same compose topology natively on host arch without --antithesis.

- export-env.py / export-compose.py: --no-antithesis flag flips the
  Repository to host-arch + plain-flavor. CI passes neither, so the
  default behavior is unchanged.
- export-compose.py: also sets pull_policy:never per service for any
  mzbuild-resolved image so docker compose doesn't HEAD-probe GHCR for
  fingerprints that only exist locally on the dev machine.
- Makefile: new make build-local + make up-local targets call the
  scripts with --no-antithesis and acquire images without the flag.
  Original make build / up are unchanged.
---
 test/antithesis/Makefile          | 80 +++++++++++++++++++++++++++----
 test/antithesis/export-compose.py | 39 +++++++++++++--
 test/antithesis/export-env.py     | 20 +++++++-
 3 files changed, 127 insertions(+), 12 deletions(-)

diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile
index 878bf7e384019..db16f58c565a3 100644
--- a/test/antithesis/Makefile
+++ b/test/antithesis/Makefile
@@ -16,12 +16,21 @@
 # spec.
 #
 # Targets:
-#   make build    # regenerate compose YAML, acquire local mzbuild images
-#   make up       # build + bring up the stack
-#   make down     # tear down (preserves volumes)
-#   make smoke    # build + up + smoke test
-#   make test     # smoke test against a running stack
-#   make clean    # tear down + remove volumes
+#   make build         # regenerate compose YAML, acquire local mzbuild images
+#   make up            # build + bring up the stack
+#   make down          # tear down (preserves volumes)
+#   make smoke         # build + up + smoke test
+#   make test          # smoke test against a running stack
+#   make clean         # tear down + remove volumes
+#
+#   make build-local   # build for local dev (no --antithesis flavor)
+#   make up-local      # build-local + bring up the stack
+#
+# The `-local` targets are for validating the workload + drivers without
+# the Antithesis platform. They build the plain (non-antithesis-flavored)
+# images, which (a) don't need libvoidstar.so locally and (b) cover all
+# images including new transitive deps (e.g. testdrive) that CI doesn't
+# yet publish under the antithesis flavor.
 
 SHELL := /usr/bin/env bash
 .SHELLFLAGS := -eu -o pipefail -c
@@ -65,11 +74,66 @@ export-env:
 	@echo "Wrote $(ENV_FILE)"
 
 acquire-images:
+	@# Force `--arch x86_64` to match what `export-env.py` writes into the
+	@# `.env` file. The Antithesis platform itself runs amd64-only — both
+	@# `export-env.py` and `export-compose.py` pin `arch=Arch.X86_64` — so
+	@# the fingerprints baked into the compose YAML are always for x86_64.
+	@# Without this flag, `bin/mzimage acquire` defaults to the host arch
+	@# (aarch64 on Apple Silicon), producing a different fingerprint than
+	@# the one the compose YAML references; the resulting image doesn't
+	@# match the compose's `image:` tag and the local stack fails to pull.
+	@# Also: aarch64 cross-compile of `--antithesis` builds needs an aarch64
+	@# `libvoidstar.so` which isn't published — x86_64 is the only flavor
+	@# Antithesis ships.
+	@for image in $(MZBUILD_IMAGES); do \
+	  echo "--- Acquiring $$image (--antithesis --arch x86_64)"; \
+	  cd $(REPO_ROOT) && bin/mzimage acquire "$$image" --antithesis --arch x86_64; \
+	done
+
+# ---------------------------------------------------------------------------
+# Local (non-antithesis) targets
+# ---------------------------------------------------------------------------
+#
+# Build and run the same compose topology without the Antithesis flavor.
+# Used for validating the workload + drivers locally before pushing to CI.
+# Plain (non-antithesis) mzbuild images:
+#   * don't need libvoidstar.so installed in the cross-sysroot
+#   * cover all transitive deps (e.g. testdrive), unlike the antithesis
+#     flavor which CI only publishes for materialized + antithesis-workload
+#     + antithesis-config.
+# The fault-orchestrator service is a no-op outside Antithesis (its
+# pause_faults.sh exits cleanly when ANTITHESIS_STOP_FAULTS is unset), so
+# the topology behaves like a regular docker-compose stack.
+
+.PHONY: build-local export-compose-local export-env-local acquire-images-local up-local
+
+build-local: export-compose-local export-env-local acquire-images-local
+
+export-compose-local:
+	cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py \
+	    --no-antithesis > $(COMPOSE_FILE)
+	@echo "Wrote $(COMPOSE_FILE) (host arch)"
+
+export-env-local:
+	cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-env.py \
+	    --no-antithesis > $(ENV_FILE)
+	@echo "Wrote $(ENV_FILE) (non-antithesis)"
+
+acquire-images-local:
+	@# Use the host arch (no `--arch` flag) so the resulting workload image
+	@# runs natively. On Apple Silicon, running the x86_64 testdrive binary
+	@# under Docker's rosetta/qemu emulation segfaults inside the
+	@# foundationdb client init — native aarch64 sidesteps that entirely.
+	@# `export-env.py --no-antithesis` mirrors the same logic and emits
+	@# host-arch fingerprints to the .env file.
 	@for image in $(MZBUILD_IMAGES); do \
-	  echo "--- Acquiring $$image (--antithesis)"; \
-	  cd $(REPO_ROOT) && bin/mzimage acquire "$$image" --antithesis; \
+	  echo "--- Acquiring $$image (plain, host arch)"; \
+	  cd $(REPO_ROOT) && bin/mzimage acquire "$$image"; \
 	done
 
+up-local: build-local
+	$(COMPOSE) up -d
+
 # ---------------------------------------------------------------------------
 # Up / Down
 # ---------------------------------------------------------------------------
diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py
index 3d8a471641df6..b2e68321b16d3 100644
--- a/test/antithesis/export-compose.py
+++ b/test/antithesis/export-compose.py
@@ -42,6 +42,7 @@
         > test/antithesis/config/docker-compose.yaml
 """
 
+import argparse
 import sys
 from pathlib import Path
 from typing import Any
@@ -90,10 +91,25 @@
 
 
 def resolve_mzbuild(svc: dict[str, Any]) -> None:
-    """Replace `mzbuild:` with a concrete or templated `image:` ref."""
+    """Replace `mzbuild:` with a concrete or templated `image:` ref.
+
+    For Materialize-built images we also set `pull_policy: never` so the
+    `make up-local` flow doesn't attempt a registry probe at compose
+    startup. The fingerprint tags only exist locally on the dev machine
+    that ran `make build-local` — they're never pushed to GHCR by that
+    flow, so the standard "check remote for newer digest" probe fails
+    with `unauthorized` and aborts the bring-up. Third-party images
+    (PUBLIC_FALLBACKS) genuinely come from upstream registries; for
+    those we leave the default pull policy alone.
+
+    The Antithesis platform itself uses a separate registry (Antithesis's
+    GCP Artifact Registry) that it does have credentials for, so the
+    `pull_policy: never` field doesn't affect a real Antithesis run.
+    """
     name = svc.pop("mzbuild")
     if name in MATERIALIZE_IMAGES:
         svc["image"] = MATERIALIZE_IMAGES[name]
+        svc["pull_policy"] = "never"
     elif name in PUBLIC_FALLBACKS:
         svc["image"] = PUBLIC_FALLBACKS[name]
     else:
@@ -338,16 +354,33 @@ def register_referenced_named_volumes(compose: dict[str, Any]) -> None:
 
 
 def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument(
+        "--no-antithesis",
+        action="store_true",
+        help=(
+            "Emit a compose YAML for local-dev (host arch) rather than the "
+            "Antithesis x86_64 platform. Mirrors `export-env.py --no-antithesis` "
+            "— together they let `make build-local` + `make up-local` run "
+            "the stack natively on Apple Silicon (the Antithesis-flavored "
+            "x86_64 testdrive binary segfaults inside Docker's rosetta/qemu "
+            "emulation)."
+        ),
+    )
+    args = parser.parse_args()
+    arch = Arch.host() if args.no_antithesis else Arch.X86_64
+    platform = "linux/amd64" if arch == Arch.X86_64 else "linux/arm64"
+
     # munge_services=False keeps ports bare (e.g., `6875` instead of
     # `127.0.0.1::6875`) — Antithesis is container-to-container, no host
     # binding. We do our own mzbuild→image substitution below and don't
     # need fingerprint resolution since Materialize-built images become
     # `${...}` placeholders.
-    repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True)
+    repo = Repository(Path("."), arch=arch, antithesis=not args.no_antithesis)
     c = Composition(repo, "antithesis", munge_services=False)
 
     for name, svc in c.compose["services"].items():
-        svc["platform"] = "linux/amd64"
+        svc["platform"] = platform
         if "mzbuild" in svc:
             resolve_mzbuild(svc)
         inline_postgres_setup(svc)
diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py
index 5488a0f097673..c7611d100436f 100644
--- a/test/antithesis/export-env.py
+++ b/test/antithesis/export-env.py
@@ -59,9 +59,27 @@ def main() -> None:
             "default `spec()` (GHCR when MZ_GHCR=1, else Docker Hub)."
         ),
     )
+    parser.add_argument(
+        "--no-antithesis",
+        action="store_true",
+        help=(
+            "Emit non-antithesis-flavored image fingerprints. Used by the "
+            "`make build-local` workflow that brings the compose up without "
+            "the Antithesis platform — the antithesis flavor needs a "
+            "libvoidstar.so we don't have locally, and the antithesis-only "
+            "deps (testdrive in particular) aren't published with the "
+            "antithesis flavor yet. CI sets neither (antithesis stays on)."
+        ),
+    )
     args = parser.parse_args()
 
-    repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True)
+    # Antithesis itself runs amd64-only, so the Antithesis-targeted build
+    # (CI default) is always x86_64. For local-dev `--no-antithesis` we use
+    # the host arch instead so the compose stack runs natively without
+    # rosetta/qemu emulation (which segfaults inside testdrive on Apple
+    # Silicon).
+    arch = Arch.host() if args.no_antithesis else Arch.X86_64
+    repo = Repository(Path("."), arch=arch, antithesis=not args.no_antithesis)
     images = [repo.images[name] for name in ENV_VARS.values()]
     deps = repo.resolve_dependencies(images)
 

From 7df5d9658db8e9f9b8ff3eeadf95cacff24411fb Mon Sep 17 00:00:00 2001
From: Dov Alperin <dov.alperin@materialize.com>
Date: Fri, 15 May 2026 18:41:49 -0400
Subject: [PATCH 65/65] test/antithesis: drivers targeting SinceViolation bug
 family (#11200 + #11224)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two new parallel_driver_ shapes that collectively cover both
peek-sequencing variants of the read-hold downgrade bug:

  parallel_driver_explicit_txn_no_since_violation
    * Property: peek-no-since-violation
    * Each invocation: 1 autocommit write into its own seed-private
      antithesis_txn_table, then a fresh connection running
      BEGIN -> 8 SELECTs alternating table/MV -> COMMIT.
    * Targets the new peek sequencing (frontend_peek.rs) via explicit
      multi-statement transactions — the in_immediate_multi_stmt_txn=true
      code path the #11224 fix (#36403) addressed.
    * Low per-MV pressure, broad invocation-level concurrency.

  parallel_driver_pw_hot_objects
    * Properties: peek-no-since-violation (same as above) +
      prepared-execute-no-since-violation (new — old peek sequencing)
    * Every invocation hammers the SHARED pw_hot_table + pw_hot_mv_count
      + pw_hot_mv_sum + pw_hot_idx_count set for 10s with a 6-action
      weighted mix (INSERT 30, PREPARE+EXECUTE 40, explicit-txn 15,
      baseline SELECT 10, DELETE-old 5).
    * Targets the old peek sequencing (#11200) via PREPARE+EXECUTE on
      an indexed MV under continuous frontier advance from concurrent
      writes. Existing parallel-workload's exe_prepared runs against
      fragmented per-seed objects (~1/4 of CI nightly's per-MV
      pressure) — the hot driver concentrates all invocations' writes
      on one shared MV so the bug window has a real chance to open.
    * Storage growth bounded by time-based DELETE
      (ts < now() - INTERVAL '5 minutes').

Why two drivers, not one: peek-no-since-violation and
prepared-execute-no-since-violation share the error matcher but are
distinct properties — old and new peek sequencing are different
code paths that happen to share a bug shape, and PR #36403 only
fixed one of them. Keeping the assertion messages distinct lets
Antithesis triage tell us which path regressed.

Also bundles helper_pw_hot.py with the shared object-name constants
and the SINCE_VIOLATION_PATTERNS / TRANSIENT_PATTERNS classifiers
that both drivers consume.
---
 .../scratchbook/property-catalog.md           |  24 ++
 .../workload/test/first_explicit_txn_setup.py |  88 +++++
 .../test/first_pw_hot_objects_setup.py        | 146 +++++++
 .../antithesis/workload/test/helper_pw_hot.py |  98 +++++
 ..._driver_explicit_txn_no_since_violation.py | 288 ++++++++++++++
 .../test/parallel_driver_pw_hot_objects.py    | 368 ++++++++++++++++++
 6 files changed, 1012 insertions(+)
 create mode 100644 test/antithesis/workload/test/first_explicit_txn_setup.py
 create mode 100644 test/antithesis/workload/test/first_pw_hot_objects_setup.py
 create mode 100644 test/antithesis/workload/test/helper_pw_hot.py
 create mode 100644 test/antithesis/workload/test/parallel_driver_explicit_txn_no_since_violation.py
 create mode 100644 test/antithesis/workload/test/parallel_driver_pw_hot_objects.py

diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md
index 39804f51a1edc..3d30128a3c39e 100644
--- a/test/antithesis/scratchbook/property-catalog.md
+++ b/test/antithesis/scratchbook/property-catalog.md
@@ -159,6 +159,30 @@ Properties that verify correctness under concurrent access patterns within the c
 | **Antithesis Angle** | Inject timing delays in the source operator between command channel invocations. Stress the sync_activator bridge between sync and async contexts. Antithesis explores whether worker scheduling variations cause reordering. |
 | **Why It Matters** | Command reordering causes workers to diverge, producing inconsistent dataflow results. The code explicitly acknowledges this is unguaranteed. Surfaced by: Concurrency. |
 
+### peek-no-since-violation — Explicit-Txn Reads Never Surface a SinceViolation Error
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — direct regression target for database-issues#11224 ("Various queries sometimes end up with insufficient read holds") and its peers (#9510, #10011, #10050, #11200, ...) |
+| **Status** | **Implemented (workload-side, two drivers)** — (1) `test/antithesis/workload/test/parallel_driver_explicit_txn_no_since_violation.py` + `first_explicit_txn_setup.py`: each invocation does one autocommit write to its own `antithesis_txn_table`, then runs `BEGIN -> 8 SELECTs alternating table/MV -> COMMIT`. Low per-MV pressure, broad invocation-level concurrency. (2) `test/antithesis/workload/test/parallel_driver_pw_hot_objects.py` + `first_pw_hot_objects_setup.py`: every invocation hammers the **shared** `pw_hot_table` + `pw_hot_mv_count` + `pw_hot_mv_sum` set, including an explicit-txn action that exercises the same property against a high-contention MV. The two drivers split the coverage axis — one covers breadth of invocations on private state, the other covers depth of contention on shared state. Both pattern-match the same `SINCE_VIOLATION_PATTERNS` (defined in `helper_pw_hot.py`) and fire the same assertion message so triage aggregates. `sometimes(True)` liveness on clean commits prevents vacuous-pass. |
+| **Property** | A read issued inside an explicit `BEGIN…COMMIT` transaction must never error with a since/read-hold mismatch (`SinceViolation`, `as_of not beyond since`, `insufficient read holds`). Materialize's transaction model stores read holds acquired by the first non-AS-OF query and reuses them for subsequent queries in the txn; the stored holds must remain valid through the entire txn lifetime. |
+| **Invariant** | `Always`: every error returned from a SELECT inside the explicit txn either matches a recognized transient pattern or is *not* a since/read-hold mismatch. `Sometimes`: the full `BEGIN..COMMIT` cycle commits cleanly at least once per run. |
+| **Antithesis Angle** | The bug needs a timing window between (a) the txn's first query acquiring read holds for the timedomain and (b) a subsequent query in the same txn validating its as_of against those holds — if the storage/compute controller's background frontier-advance work downgrades the stored holds in between, validation fails. Antithesis amplifies the window via clusterd/materialized restarts mid-txn, pauses on the timestamp oracle, and concurrent inserts from other invocations that keep the source frontier moving. **Relationship to parallel-workload**: the bug surfaces in *two* code paths — the **new** peek sequencing (`frontend_peek.rs`, used by direct SELECT inside `BEGIN..COMMIT`, fixed in PR #36403) and the **old** peek sequencing (still used by `EXECUTE` of prepared statements, `sequence_plan → sequence_execute → handle_execute`). `parallel_driver_parallel_workload`'s `exe_prepared` does `PREPARE … EXECUTE foo … DEALLOCATE`, which exercises the old path — that's the surface CI nightly has flaked on (e.g. database-issues#11200). This driver targets the **new** path via explicit transactions; the two are complementary, not redundant. |
+| **Why It Matters** | Customer SELECTs from a `BEGIN..COMMIT` block are not a corner case — they're the default behavior of any ORM, BI tool, or "transactional read" pattern. A SinceViolation surfaced to such a client is a hard query failure with no obvious workaround. The bug has flaked across at least five CI occurrences (#11224 lists them) over several months. PR #36403 fixes the new-peek-sequencing variant; this property is the regression net for that fix. |
+
+### prepared-execute-no-since-violation — `PREPARE…EXECUTE` Never Surfaces a SinceViolation (Old Peek Sequencing)
+
+| | |
+|---|---|
+| **Type** | Safety |
+| **Priority** | P1 — direct target for database-issues#11200, the parallel-workload CI flake on `EXECUTE foo` returning `peek timestamp is not beyond the since of collection: u237`. |
+| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_pw_hot_objects.py` does `PREPARE q_<batch>_<i> AS SELECT … FROM pw_hot_mv_count; EXECUTE q_<batch>_<i>; DEALLOCATE q_<batch>_<i>` against the **indexed** MV at a high rate while concurrent invocations INSERT and DELETE on the underlying table (driving frontier advance). Two prepared-statement shapes are exercised: a single-collection `SELECT FROM pw_hot_mv_count` (indexed, hits the `peek_target.id()` validation at `controller.rs:892`) and a multi-collection `SELECT FROM pw_hot_mv_sum, pw_hot_table` (larger `input_id_bundle`, hits the stored-holds-subset path). Errors matching `SINCE_VIOLATION_PATTERNS` fire `always(False)` with `op: prepared-execute-on-hot-objects` in the details so triage can separate this from the explicit-txn variant. |
+| **Property** | A SELECT executed via `PREPARE` + `EXECUTE` must never error with a since/read-hold mismatch. The portal-driven `EXECUTE` path goes through the **old** peek sequencing (`sequence_plan → sequence_execute → handle_execute`) — a code path that is distinct from `frontend_peek.rs` and was not addressed by PR #36403. The same root bug shape (re-resolve read holds by ID, get an already-downgraded view) lives on this side until the old peek sequencing is retired. |
+| **Invariant** | `Always`: errors from `EXECUTE foo` either match a recognized transient pattern or are not a since-related read-hold error. `Sometimes`: at least one `PREPARE…EXECUTE` cycle completes cleanly per invocation. |
+| **Antithesis Angle** | Antithesis's container-level pauses + the driver's own concurrent INSERT/DELETE workload generate the frontier-advance churn that the bug needs. The indexed MV's peeks route through the validation site in `controller.rs:892`; the concurrent writes downgrade the source's `since` continuously; if the stored read hold on the prepared statement is re-resolved against the newer view, validation fails. **Why parallel-workload doesn't already cover this**: parallel-workload's `exe_prepared` runs against **fragmented per-seed objects** — each invocation's MV only has NUM_THREADS=4 writers in its own seed-scoped database. Per-MV pressure is roughly one quarter of CI nightly's level, which is why CI catches this bug and Antithesis hasn't (yet). The hot-objects driver concentrates all invocations' pressure on a single shared MV. |
+| **Why It Matters** | Every customer using a prepared-statement API (most ORMs, JDBC, pgx, drivers that re-use connections) routes through this code path. A SinceViolation here is a connection-fatal error for that client. database-issues#11200 has flaked on this path multiple times in CI; the fact that our Antithesis property catalog had no entry for it until now reflects exactly the per-object pressure gap this driver was built to close. |
+
 ## Category 5: Lifecycle Transitions
 
 Properties about 0DT deployment, startup, and shutdown correctness.
diff --git a/test/antithesis/workload/test/first_explicit_txn_setup.py b/test/antithesis/workload/test/first_explicit_txn_setup.py
new file mode 100644
index 0000000000000..5b2a47998a570
--- /dev/null
+++ b/test/antithesis/workload/test/first_explicit_txn_setup.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis first_ command: scaffolding for the explicit-txn driver.
+
+Creates one table and one materialized view over it; the MV is created
+in `antithesis_cluster`. The objects are `antithesis_txn_*`-prefixed so
+the testdrive-runner singleton's reset (which preserves
+`antithesis_*`-prefixed names) won't clobber them.
+
+Why a dedicated table+MV: `parallel_driver_explicit_txn_no_since_violation`
+issues `BEGIN -> SELECT ... -> SELECT ... -> COMMIT` sequences and asserts
+`always(no SinceViolation)`. A non-empty table with an MV over it is the
+smallest object set that:
+  (a) gives the explicit-txn timestamp determination two collections to
+      acquire read holds against (matching the bug's "stored read holds
+      restricted to input_id_bundle" code path), and
+  (b) gives the MV's dataflow background frontier advances that can
+      race against the txn's stored as_of — the timing window the bug
+      lives in.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import helper_logging
+from helper_pg import execute_retry, query_retry
+
+from antithesis.assertions import reachable
+
+LOG = helper_logging.setup_logging("first.explicit_txn_setup")
+
+CLUSTER = "antithesis_cluster"
+TABLE_NAME = "antithesis_txn_table"
+MV_NAME = "antithesis_txn_mv"
+
+# Initial rows so the first SELECT inside an explicit txn has data to
+# read. Small — the driver issues additional INSERTs to keep frontiers
+# advancing.
+INITIAL_ROWS = 1000
+
+
+def _exists(catalog: str, name: str) -> bool:
+    """Return whether a lookup of `name` in `catalog` yields any rows."""
+    return bool(query_retry(f"SELECT 1 FROM {catalog} WHERE name = %s", (name,)))
+
+
+def main() -> int:
+    if not _exists("mz_tables", TABLE_NAME):
+        LOG.info("creating %s in cluster %s", TABLE_NAME, CLUSTER)
+        execute_retry(
+            f"CREATE TABLE {TABLE_NAME} (id BIGINT PRIMARY KEY, v BIGINT NOT NULL)"
+        )
+        execute_retry(
+            f"INSERT INTO {TABLE_NAME} "
+            f"SELECT i, i * 2 FROM generate_series(1, {INITIAL_ROWS}) AS i"
+        )
+    else:
+        LOG.info("%s already exists; skipping", TABLE_NAME)
+
+    if not _exists("mz_materialized_views", MV_NAME):
+        LOG.info("creating %s in cluster %s", MV_NAME, CLUSTER)
+        execute_retry(
+            f"CREATE MATERIALIZED VIEW {MV_NAME} "
+            f"IN CLUSTER {CLUSTER} AS "
+            f"SELECT count(*) AS n, sum(v) AS s FROM {TABLE_NAME}"
+        )
+    else:
+        LOG.info("%s already exists; skipping", MV_NAME)
+
+    reachable(
+        "explicit-txn: scaffolding ready (table + MV created in antithesis_cluster)",
+        {"table": TABLE_NAME, "mv": MV_NAME, "cluster": CLUSTER},
+    )
+    LOG.info("explicit-txn scaffolding setup complete")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/first_pw_hot_objects_setup.py b/test/antithesis/workload/test/first_pw_hot_objects_setup.py
new file mode 100644
index 0000000000000..1fb3b90f593fd
--- /dev/null
+++ b/test/antithesis/workload/test/first_pw_hot_objects_setup.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis first_ command: scaffolding for the hot-objects driver.
+
+Creates one shared table, two materialized views over it, and an index
+on one of the MVs. Every invocation of
+`parallel_driver_pw_hot_objects.py` reads from and writes to this fixed
+object set — that's the whole point: concentrate per-object contention
+in one place so #11200-shape bugs (continuous-frontier-advance +
+stored-read-holds) actually have a window to fire.
+
+Object set is described in `helper_pw_hot.py`. Why each piece:
+
+  * `pw_hot_table` — TABLE. Accepts INSERTs and DELETEs from every
+    invocation. The persist frontier on this table is the thing the
+    bug needs to keep moving.
+  * `pw_hot_mv_count` — MV computing count(*) on the table. Single-input,
+    small footprint, very fast peeks. Indexed (below) to route through
+    the `peek_target.id()` validation in `controller.rs:892` — one of
+    the three SinceViolation sites.
+  * `pw_hot_mv_sum` — MV computing count/sum/count-distinct. Larger
+    input_id_bundle (when read alongside the table in explicit txns)
+    so the stored-read-holds-subset code path gets exercised.
+  * `pw_hot_idx_count` — INDEX on `pw_hot_mv_count`. Forces the MV to
+    be arranged; peeks go through the indexed-collection path.
+
+Both MVs and the index live in `antithesis_cluster` — the existing
+two-replica unmanaged cluster — so the multi-replica peek path is exercised.
+
+Setup is idempotent: each step guards on a catalog lookup, so it is
+safe to re-run on every Antithesis timeline start. The guards are
+check-then-create (the DDL does not use `IF NOT EXISTS`), so concurrent
+runs of this script (unlikely, but possible if Antithesis spawns several
+`first_` commands in parallel) can race on the same CREATE. Objects are
+never dropped by any driver — they belong to the timeline.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import helper_logging
+from helper_pg import execute_retry, query_retry
+from helper_pw_hot import (
+    CLUSTER,
+    INDEX_NAME,
+    MV_COUNT_NAME,
+    MV_SUM_NAME,
+    TABLE_NAME,
+)
+
+from antithesis.assertions import reachable
+
+LOG = helper_logging.setup_logging("first.pw_hot_objects_setup")
+
+# Initial seed rows — enough for the MVs to have something to compute
+# without forcing a slow initial snapshot. The driver adds writes on
+# every invocation.
+INITIAL_ROWS = 1000
+
+
+def _exists(catalog: str, name: str) -> bool:
+    """Return whether a lookup of `name` in `catalog` yields any rows."""
+    return bool(query_retry(f"SELECT 1 FROM {catalog} WHERE name = %s", (name,)))
+
+
+def main() -> int:
+    if not _exists("mz_tables", TABLE_NAME):
+        LOG.info("creating %s", TABLE_NAME)
+        execute_retry(
+            f"CREATE TABLE {TABLE_NAME} ("
+            f"  id          BIGINT      PRIMARY KEY,"
+            f"  v           BIGINT      NOT NULL,"
+            f"  written_by  TEXT        NOT NULL,"
+            f"  ts          TIMESTAMPTZ NOT NULL DEFAULT now()"
+            f")"
+        )
+        execute_retry(
+            f"INSERT INTO {TABLE_NAME} (id, v, written_by) "
+            f"SELECT i, i * 2, 'initial-seed' "
+            f"FROM generate_series(1, {INITIAL_ROWS}) AS i"
+        )
+    else:
+        LOG.info("%s already exists; skipping", TABLE_NAME)
+
+    if not _exists("mz_materialized_views", MV_COUNT_NAME):
+        LOG.info("creating %s in cluster %s", MV_COUNT_NAME, CLUSTER)
+        execute_retry(
+            f"CREATE MATERIALIZED VIEW {MV_COUNT_NAME} "
+            f"IN CLUSTER {CLUSTER} AS "
+            f"SELECT count(*) AS n FROM {TABLE_NAME}"
+        )
+    else:
+        LOG.info("%s already exists; skipping", MV_COUNT_NAME)
+
+    if not _exists("mz_materialized_views", MV_SUM_NAME):
+        LOG.info("creating %s in cluster %s", MV_SUM_NAME, CLUSTER)
+        execute_retry(
+            f"CREATE MATERIALIZED VIEW {MV_SUM_NAME} "
+            f"IN CLUSTER {CLUSTER} AS "
+            f"SELECT count(*) AS n, "
+            f"       sum(v) AS s, "
+            f"       count(DISTINCT written_by) AS writers "
+            f"FROM {TABLE_NAME}"
+        )
+    else:
+        LOG.info("%s already exists; skipping", MV_SUM_NAME)
+
+    if not _exists("mz_indexes", INDEX_NAME):
+        LOG.info(
+            "creating index %s on %s in cluster %s",
+            INDEX_NAME,
+            MV_COUNT_NAME,
+            CLUSTER,
+        )
+        execute_retry(
+            f"CREATE INDEX {INDEX_NAME} "
+            f"IN CLUSTER {CLUSTER} "
+            f"ON {MV_COUNT_NAME} (n)"
+        )
+    else:
+        LOG.info("%s already exists; skipping", INDEX_NAME)
+
+    reachable(
+        "pw-hot: shared table + MVs + index scaffolding ready",
+        {
+            "table": TABLE_NAME,
+            "mvs": [MV_COUNT_NAME, MV_SUM_NAME],
+            "index": INDEX_NAME,
+            "cluster": CLUSTER,
+        },
+    )
+    LOG.info("pw-hot scaffolding setup complete")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_pw_hot.py b/test/antithesis/workload/test/helper_pw_hot.py
new file mode 100644
index 0000000000000..a85e139d43481
--- /dev/null
+++ b/test/antithesis/workload/test/helper_pw_hot.py
@@ -0,0 +1,98 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Shared constants for the hot-objects parallel-workload driver.
+
+`parallel_driver_pw_hot_objects` and `first_pw_hot_objects_setup` both
+reference a small fixed set of objects in `materialize.public` that
+every invocation of the hot driver writes to and reads from. Centralised
+here so the setup and the driver stay in lockstep — renaming the MV, for
+example, is a one-line change rather than a two-file edit.
+
+Why hot objects exist: see `test/antithesis/scratchbook/property-catalog.md`
+entries `peek-no-since-violation` and `prepared-execute-no-since-violation`.
+Short version: the explicit-txn driver and the `parallel-workload` driver
+both fragment workload state per-invocation, which dilutes per-object
+contention to roughly one quarter of CI nightly's level. Bugs like
+database-issues#11200 need continuous frontier advance on a single MV
+across many concurrent peek paths to surface; the hot objects exist to
+concentrate that pressure.
+"""
+
+from __future__ import annotations
+
+# All hot objects live in the default database+schema so every driver
+# invocation finds them at the same fully-qualified name. They are
+# **never dropped** by any driver — they belong to the timeline, not
+# any single invocation.
+CLUSTER = "antithesis_cluster"
+TABLE_NAME = "pw_hot_table"
+MV_COUNT_NAME = "pw_hot_mv_count"
+MV_SUM_NAME = "pw_hot_mv_sum"
+INDEX_NAME = "pw_hot_idx_count"
+
+# Time-based pruning window. The hot driver issues
+#   DELETE FROM pw_hot_table WHERE ts < now() - INTERVAL '5 minutes'
+# at a low rate so the table doesn't grow unboundedly across long
+# Antithesis runs. The retention window also doubles as a "how stale
+# can a stored read hold get" knob — larger window = longer-lived rows
+# = bigger SinceViolation window if the bug fires.
+RETENTION = "INTERVAL '5 minutes'"
+
+# Error-message patterns that classify a peek failure as
+# "the read-hold-was-downgraded-under-us bug" vs anything else. Lifted
+# out of the driver so all consumers can stay in sync if Materialize
+# changes its error text.
+#
+# Pulled from the bug surface:
+#   * src/compute-client/src/controller.rs:788, 892 — SinceViolation(...)
+#   * src/storage-types/src/read_holds.rs:143    — ReadHoldDowngradeError::SinceViolation
+#   * src/persist-client/src/read.rs:680        — "since of our read handle is merely"
+#                                                  (database-issues#9510)
+#   * adapter error path                         — "insufficient read holds",
+#                                                  "as_of not beyond since",
+#                                                  "peek timestamp is not beyond the since"
+SINCE_VIOLATION_PATTERNS = (
+    "sinceviolation",
+    "as_of not beyond",
+    "as_of of",
+    "since of our read handle is merely",
+    "insufficient read holds",
+    "dataflow has an as_of not beyond",
+    "peek timestamp is not beyond the since",
+)
+
+# Errors we expect to see under Antithesis fault injection but that
+# aren't the bug: a kill of clusterd, a pause of materialized, a broker
+# blip. Demoted to `sometimes(False)` rather than `always(False)` so
+# transients don't fire false-positive property violations.
+TRANSIENT_PATTERNS = (
+    "connection refused",
+    "connection reset",
+    "server closed the connection",
+    "is (re)initializing",
+    "toomanyrequests",
+    "terminating connection due to administrator command",
+    "broken pipe",
+    "eof detected",
+)
+
+
+def matches_any(msg: str, patterns: tuple[str, ...]) -> bool:
+    """Case-insensitive substring search of `msg` for any of `patterns`."""
+    lo = msg.lower()
+    return any(p.lower() in lo for p in patterns)  # fold both sides
+
+
+def is_since_violation(msg: str) -> bool:
+    return matches_any(msg, SINCE_VIOLATION_PATTERNS)
+
+
+def is_transient(msg: str) -> bool:
+    return matches_any(msg, TRANSIENT_PATTERNS)
diff --git a/test/antithesis/workload/test/parallel_driver_explicit_txn_no_since_violation.py b/test/antithesis/workload/test/parallel_driver_explicit_txn_no_since_violation.py
new file mode 100644
index 0000000000000..29e5ebe09fa72
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_explicit_txn_no_since_violation.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for property `peek-no-since-violation`.
+
+Targets the bug family at database-issues#11224 ("Various queries
+sometimes end up with insufficient read holds") and its peers (#9510,
+#10011, #10050, #11200). Every occurrence in the issue carried
+`in_immediate_multi_stmt_txn: true` — i.e. the bug only fires on
+*subsequent* queries inside an explicit `BEGIN…COMMIT` transaction,
+where Materialize uses the read holds the first query acquired.
+
+The bug surfaces in two code paths that share a root cause (read holds
+re-resolved by ID returning an already-downgraded view):
+
+  - **New peek sequencing** (`frontend_peek.rs`), used by direct SELECT
+    inside a `BEGIN..COMMIT`. Fixed by PR #36403 ("thread bare
+    ReadHolds through ComputeController::peek"). **This is what this
+    driver targets.**
+  - **Old peek sequencing** (`sequence_plan → sequence_execute →
+    handle_execute`), still used by `EXECUTE` of prepared statements.
+    `parallel_driver_parallel_workload`'s `exe_prepared` does
+    `PREPARE … EXECUTE foo … DEALLOCATE` per random SELECT and is the
+    surface CI nightly has flaked on (e.g. database-issues#11200).
+
+The two drivers are complementary, not redundant — neither
+explicit-txn-SELECT nor PREPARE/EXECUTE subsumes the other code path.
+
+Each invocation:
+  1. Issues a small write against `antithesis_txn_table` (autocommit) —
+     committed writes are what advance the table's persist frontier and
+     keep the timestamp oracle moving, giving the explicit txn below
+     real frontier work to race against.
+  2. Opens a separate connection, runs an explicit transaction:
+         BEGIN
+         SELECT … FROM antithesis_txn_table    (1st query: acquires read holds)
+         SELECT … FROM antithesis_txn_mv       (subsequent: uses stored holds)
+         SELECT … FROM antithesis_txn_table
+         SELECT … FROM antithesis_txn_mv
+         COMMIT
+     The intermixed table/MV reads keep the input_id_bundle changing,
+     so the stored-holds subset / re-resolution path is exercised on
+     every statement.
+  3. If a `SinceViolation`/`as_of not beyond since`/`insufficient read
+     holds` error escapes any of the SELECTs, fires `always(False)`
+     with the offending error attached. Other errors (connection drop
+     from a clusterd kill, transient broker/admission issues) are
+     demoted to `sometimes(False)`.
+  4. Records a `sometimes(True)` liveness anchor when the whole txn
+     commits cleanly so triage can tell vacuous-pass from real coverage.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_logging
+import helper_random
+import psycopg
+from helper_pg import connect
+
+from antithesis.assertions import always, sometimes
+
+LOG = helper_logging.setup_logging("driver.explicit_txn")
+
+CLUSTER = "antithesis_cluster"
+TABLE_NAME = "antithesis_txn_table"
+MV_NAME = "antithesis_txn_mv"
+
+# Pulled from the bug surface (see frontend_peek.rs +
+# compute-client/src/controller.rs around the SinceViolation returns).
+# We deliberately also match the persist-side variant the related
+# bug #9510 surfaces, so this driver doubles as the regression anchor
+# for that one — both express the same since-was-advanced-under-us
+# pattern.
+_SINCE_VIOLATION_PATTERNS = (  # lowercase substrings for _matches_any()
+    "sinceviolation",
+    "as_of not beyond",  # also covers "dataflow has an as_of not beyond"
+    "as_of of",
+    "since of our read handle is merely",
+    "insufficient read holds",
+    "peek timestamp is not beyond the since",
+)
+
+# Errors treated as transient fault-injection noise (killed
+# clusterd/materialized, admission control, broker blips); not the bug.
+# NOTE(review): the shared TRANSIENT_PATTERNS helper list also has
+# "broken pipe" and "eof detected"; here those count as "unknown".
+_TRANSIENT_PATTERNS = (
+    "connection refused",
+    "connection reset",
+    "server closed the connection",
+    "is (re)initializing",
+    "toomanyrequests",
+    "terminating connection due to administrator command",
+)
+
+# How many SELECTs to fire inside each explicit transaction. Big enough
+# that we cover the >1-subsequent-query case (which is where stored
+# read holds get re-resolved); small enough that Antithesis can run
+# many independent invocations concurrently.
+SELECTS_PER_TXN = 8
+
+# Rows inserted per invocation (autocommit) to drive frontier advance. A
+# few rows per invocation, multiplied by many concurrent invocations, is
+# enough churn for the timestamp oracle to move between txn-statements.
+WRITE_ROWS_PER_INVOCATION = 5
+
+
+def _matches_any(msg: str, patterns: tuple[str, ...]) -> bool:
+    haystack = msg.lower()  # patterns are expected to be lowercase already
+    return any(needle in haystack for needle in patterns)
+
+
+def _drive_frontier_advance(batch_id: str) -> int:
+    """Insert WRITE_ROWS_PER_INVOCATION rows under autocommit, ahead of
+    the explicit txn, so the table's persist frontier keeps moving.
+
+    Returns the number of INSERTs that succeeded. Failures (a per-row
+    psycopg error, or any other exception from the connection block)
+    are logged and swallowed. `batch_id` is currently unused.
+    """
+    inserted = 0
+    try:
+        with connect(autocommit=True) as conn, conn.cursor() as cur:
+            for i in range(WRITE_ROWS_PER_INVOCATION):
+                # Random u64 masked to 63 bits (top bit cleared,
+                # presumably so it fits a signed 64-bit `id` column —
+                # confirm against the table DDL). Collision odds across
+                # the workload lifetime are vanishingly small, so plain
+                # INSERT is fine. (Materialize doesn't support
+                # `ON CONFLICT DO UPDATE` — that's a Postgres extension.)
+                row_id = helper_random.random_u64() & 0x7FFF_FFFF_FFFF_FFFF
+                try:
+                    cur.execute(
+                        f"INSERT INTO {TABLE_NAME} (id, v) VALUES (%s, %s)",
+                        (row_id, i),
+                    )
+                    inserted += 1
+                except psycopg.Error as exc:
+                    LOG.info("frontier-advance insert failed (%s); skipping", exc)
+    except Exception as exc:  # noqa: BLE001
+        LOG.info("frontier-advance: open/close failed (%s); skipping", exc)
+    return inserted
+
+
+def _run_explicit_txn(batch_id: str) -> tuple[bool, str | None]:
+    """Run BEGIN -> N SELECTs -> COMMIT explicitly on an autocommit connection.
+
+    Returns (clean, error_message_or_None). `clean` is True iff every
+    SELECT inside the txn succeeded and the COMMIT landed. The caller
+    decides whether a non-clean run is a property violation or a
+    transient (fault-injection noise). `batch_id` is currently unused.
+    """
+    # Use autocommit=True at the psycopg layer and send `BEGIN`
+    # explicitly. Two reasons: (a) the wire conversation matches what
+    # `in_immediate_multi_stmt_txn` actually checks on Materialize's
+    # side — that flag toggles on the explicit `BEGIN`, not on
+    # psycopg's implicit-txn-from-non-autocommit behavior; (b) it
+    # lets us be exact about transaction boundaries in logs and
+    # avoid the implicit-rollback-on-error landmine when something
+    # inside the txn raises.
+    try:
+        with connect(autocommit=True) as conn, conn.cursor() as cur:
+            cur.execute("BEGIN")
+            try:
+                for i in range(SELECTS_PER_TXN):
+                    if i % 2 == 0:
+                        cur.execute(
+                            f"SELECT count(*) FROM {TABLE_NAME} WHERE id > %s",
+                            (i,),
+                        )
+                    else:
+                        cur.execute(f"SELECT n, s FROM {MV_NAME}")
+                    cur.fetchall()
+                cur.execute("COMMIT")
+            except psycopg.Error:
+                # Roll back so the connection isn't left in a wedged
+                # txn state. ROLLBACK against a connection that already
+                # aborted is a no-op error; ignore.
+                try:
+                    cur.execute("ROLLBACK")
+                except psycopg.Error:
+                    pass
+                raise
+        return True, None
+    except psycopg.Error as exc:
+        return False, str(exc)
+    except Exception as exc:  # noqa: BLE001
+        # Non-psycopg errors (e.g. the connection helper itself gave
+        # up after its retry budget) are treated as transients too.
+        return False, str(exc)
+
+
+def main() -> int:
+    """Run one write + explicit-txn cycle, fire its assertions, return 0."""
+    batch_id = f"t{helper_random.random_u64():016x}"
+    LOG.info("driver starting; batch_id=%s", batch_id)
+
+    # Step 1: drive frontier advance.
+    inserted = _drive_frontier_advance(batch_id)
+    LOG.info("batch=%s inserted %d frontier-advance rows", batch_id, inserted)
+
+    # Brief pause so the timestamp oracle has a moment to publish the
+    # new frontier before the next connection opens. A baseline, not a
+    # guarantee: fault injection stretches or compresses it arbitrarily.
+    time.sleep(0.05)
+
+    # Step 2: run the explicit transaction; capture verdict.
+    clean, err = _run_explicit_txn(batch_id)
+
+    if clean:
+        sometimes(
+            True,
+            "explicit-txn: full BEGIN..SELECTs..COMMIT cycle completes cleanly",
+            {"batch_id": batch_id, "selects": SELECTS_PER_TXN},
+        )
+        # Safety side fires a trivially-true `always` so the assertion
+        # site exists in the catalog even on no-fault runs; Antithesis
+        # uses presence-of-firing as coverage signal.
+        always(
+            True,
+            "peek: explicit-txn reads never surface a SinceViolation or "
+            "since-related read-hold error",
+            {"batch_id": batch_id, "verdict": "clean"},
+        )
+        LOG.info("batch=%s txn clean", batch_id)
+        return 0
+
+    assert err is not None
+    if _matches_any(err, _SINCE_VIOLATION_PATTERNS):
+        # The bug. Surface as a property violation with the offending
+        # message so triage can correlate against #11224 et al.
+        always(
+            False,
+            "peek: explicit-txn reads never surface a SinceViolation or "
+            "since-related read-hold error",
+            {
+                "batch_id": batch_id,
+                "verdict": "since_violation",
+                "error": err[:1500],
+            },
+        )
+        LOG.warning("batch=%s SINCE VIOLATION caught: %s", batch_id, err[:200])
+        return 0
+
+    if _matches_any(err, _TRANSIENT_PATTERNS):
+        sometimes(
+            False,
+            "explicit-txn: full BEGIN..SELECTs..COMMIT cycle completes cleanly",
+            {
+                "batch_id": batch_id,
+                "verdict": "transient",
+                "error": err[:500],
+            },
+        )
+        LOG.info("batch=%s txn transient: %s", batch_id, err[:200])
+        return 0
+
+    # Unknown error class: not a SinceViolation, not a recognized
+    # transient. Record it as transient (to avoid false-positive
+    # property violations) but log loud enough to surface in triage.
+    sometimes(
+        False,
+        "explicit-txn: full BEGIN..SELECTs..COMMIT cycle completes cleanly",
+        {
+            "batch_id": batch_id,
+            "verdict": "unknown",
+            "error": err[:500],
+        },
+    )
+    logging.getLogger("driver.explicit_txn").warning(
+        "batch=%s txn unknown-error: %s", batch_id, err[:300]
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/parallel_driver_pw_hot_objects.py b/test/antithesis/workload/test/parallel_driver_pw_hot_objects.py
new file mode 100644
index 0000000000000..4a7337b93f986
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_pw_hot_objects.py
@@ -0,0 +1,368 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver: high-contention workload against shared hot objects.
+
+Closes the per-object-pressure gap between Antithesis and CI nightly's
+parallel-workload runs. Targets database-issues#11200 (`EXECUTE foo`
+SinceViolation, **old** peek sequencing) and the broader #11224 family.
+
+Every invocation runs a tight action-mix loop against the
+`pw_hot_table` / `pw_hot_mv_count` / `pw_hot_mv_sum` set created by
+`first_pw_hot_objects_setup`. Two assertion sites fire from this driver:
+
+  * `peek-no-since-violation` — same message as the explicit-txn driver
+    so Antithesis aggregates; details say `op=explicit-txn-on-hot-objects`.
+  * `prepared-execute-no-since-violation` — distinct message, the
+    old-peek-sequencing variant via `PREPARE … EXECUTE … DEALLOCATE`.
+    Bugs here probably outlive a fix to the new-peek-sequencing variant
+    so keeping the two assertion sites separate lets triage tell us
+    *which* path is regressing.
+
+Action mix (weights chosen to maximise per-MV pressure while keeping
+the loop fault-tolerant):
+
+  30  INSERT into pw_hot_table          (drives source frontier advance)
+  25  PREPARE+EXECUTE on pw_hot_mv_count (old peek sequencing path)
+  15  PREPARE+EXECUTE on pw_hot_mv_sum + table  (larger input bundle)
+  15  BEGIN; SELECT mv; SELECT table; COMMIT   (new peek sequencing path)
+  10  Single-shot SELECT                 (baseline happy path)
+   5  DELETE old rows                    (frontier retreat + storage bound)
+
+Connection: one autocommit connection per invocation, reused for every
+action in the loop (PREPAREd statements are connection-scoped). The
+explicit-txn action sends `BEGIN`/`COMMIT` on the wire — psycopg's
+autocommit knob is intentionally not toggled, so the wire conversation
+matches what Materialize's `in_immediate_multi_stmt_txn` flag tracks.
+"""
+
+from __future__ import annotations
+
+import sys
+import time
+
+import helper_logging
+import helper_pw_hot
+import helper_random
+import psycopg
+from helper_pg import connect
+from helper_pw_hot import (
+    INDEX_NAME,  # noqa: F401  -- exposed for triage details
+    MV_COUNT_NAME,
+    MV_SUM_NAME,
+    RETENTION,
+    TABLE_NAME,
+)
+
+from antithesis.assertions import always, sometimes
+
+LOG = helper_logging.setup_logging("driver.pw_hot")
+
+# Total wallclock budget per invocation. Long enough to fire many
+# PREPAREs and DELETEs; short enough that Antithesis can interleave
+# many invocations across a run.
+RUNTIME_S = 10.0
+
+# Each action's relative weight in the picker. Tuples of (name, weight).
+# Names are also the assertion-detail tag — keep them grep-friendly.
+_ACTIONS: tuple[tuple[str, int], ...] = (
+    ("insert", 30),
+    ("prepared_count", 25),
+    ("prepared_join", 15),
+    ("explicit_txn", 15),
+    ("select_baseline", 10),
+    ("delete_old", 5),
+)
+
+
+def _pick_action(rng_total: int) -> str:
+    """Weighted pick from `_ACTIONS`; `rng_total` must be the sum of its weights."""
+    remaining = helper_random.random_int(1, rng_total)
+    for action_name, action_weight in _ACTIONS:
+        if remaining <= action_weight:
+            return action_name
+        remaining -= action_weight
+    # Fallback if the draw exceeds the total weight (e.g. `rng_total` too large).
+    return _ACTIONS[-1][0]
+
+
+# Verdict tags for assertion details. Kept aligned with the two
+# property entries in the catalog.
+_VERDICT_CLEAN = "clean"
+_VERDICT_TRANSIENT = "transient"
+_VERDICT_SINCE_VIOLATION = "since_violation"
+_VERDICT_UNKNOWN = "unknown"
+
+
+def _classify(err: str) -> str:
+    # Since-violation is checked first, so it wins over a transient match.
+    if not helper_pw_hot.is_since_violation(err):
+        transient = helper_pw_hot.is_transient(err)
+        return _VERDICT_TRANSIENT if transient else _VERDICT_UNKNOWN
+    return _VERDICT_SINCE_VIOLATION
+
+
+# ---------------------------------------------------------------------------
+# Action implementations
+# ---------------------------------------------------------------------------
+#
+# Each action takes (cur, batch_id, op_idx) and either completes or raises
+# `psycopg.Error`. The driver loop catches the error, decides whether
+# to fire `always(False)`, and continues.
+
+
+def _act_insert(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    """Insert one row with a random 63-bit id, tagged with op index and batch."""
+    random_id = helper_random.random_u64() & 0x7FFF_FFFF_FFFF_FFFF
+    insert_sql = f"INSERT INTO {TABLE_NAME} (id, v, written_by) VALUES (%s, %s, %s)"
+    params = (random_id, op_idx, batch_id)
+    cur.execute(insert_sql, params)
+
+
+def _act_delete_old(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    # Time-based pruning: rows with `ts` older than `now() - RETENTION`
+    # go. The picker runs this at low weight so storage stays bounded
+    # without DELETE becoming the dominant frontier-advance signal.
+    cur.execute(f"DELETE FROM {TABLE_NAME} WHERE ts < now() - {RETENTION}")
+
+
+def _act_select_baseline(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    # Single-shot SELECT outside any explicit txn; fetched rows are discarded.
+    cur.execute(f"SELECT n FROM {MV_COUNT_NAME} LIMIT 1").fetchall()
+
+
+def _act_prepared_count(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    """The #11200-targeting path: PREPARE+EXECUTE on the indexed MV.
+
+    The statement name embeds `batch_id` and `op_idx`. The connection
+    is per-invocation, so cross-invocation collisions are impossible
+    anyway; the `op_idx` part keeps names unique on this connection in
+    case an earlier statement lingers because its best-effort
+    DEALLOCATE (run in `finally`) failed.
+    """
+    stmt = f"q_{batch_id}_{op_idx}"
+    cur.execute(f"PREPARE {stmt} AS SELECT n FROM {MV_COUNT_NAME}")
+    try:
+        cur.execute(f"EXECUTE {stmt}")
+        cur.fetchall()
+    finally:
+        try:
+            cur.execute(f"DEALLOCATE {stmt}")
+        except psycopg.Error:
+            pass
+
+
+def _act_prepared_join(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    """PREPARE+EXECUTE a query reading both `MV_SUM_NAME` and `TABLE_NAME`.
+
+    Same shape as `_act_prepared_count`, but the statement joins the MV
+    against the table row holding the current max `id`, so the peek's
+    inputs span more than one collection. DEALLOCATE is best-effort.
+    """
+    stmt_name = f"qj_{batch_id}_{op_idx}"
+    prepare_sql = (
+        f"PREPARE {stmt_name} AS "
+        f"SELECT mv.n, mv.s, t.v "
+        f"FROM {MV_SUM_NAME} mv, {TABLE_NAME} t "
+        f"WHERE t.id = (SELECT max(id) FROM {TABLE_NAME})"
+    )
+    cur.execute(prepare_sql)
+    try:
+        cur.execute(f"EXECUTE {stmt_name}")
+        cur.fetchall()
+    finally:
+        try:
+            cur.execute(f"DEALLOCATE {stmt_name}")
+        except psycopg.Error:
+            pass
+
+
+def _act_explicit_txn(cur: psycopg.Cursor, batch_id: str, op_idx: int) -> None:
+    """Run BEGIN, four SELECTs alternating table/MV, then COMMIT.
+
+    Exercises the same property as
+    `parallel_driver_explicit_txn_no_since_violation`, but against the
+    hot objects. On a psycopg error a best-effort ROLLBACK is sent and
+    the original error is re-raised to the caller.
+    """
+    table_sql = f"SELECT count(*) FROM {TABLE_NAME} WHERE id > %s"
+    mv_sql = f"SELECT n, s FROM {MV_SUM_NAME}"
+    cur.execute("BEGIN")
+    try:
+        for step in range(4):
+            if step % 2:
+                cur.execute(mv_sql)  # odd steps read the MV
+            else:
+                cur.execute(table_sql, (step,))  # even steps read the table
+            cur.fetchall()
+        cur.execute("COMMIT")
+    except psycopg.Error:
+        # Best-effort ROLLBACK; the original error is what the caller sees.
+        try:
+            cur.execute("ROLLBACK")
+        except psycopg.Error:
+            pass
+        raise
+
+
+_ACTION_FNS = {
+    "insert": _act_insert,
+    "delete_old": _act_delete_old,
+    "select_baseline": _act_select_baseline,
+    "prepared_count": _act_prepared_count,
+    "prepared_join": _act_prepared_join,
+    "explicit_txn": _act_explicit_txn,
+}
+
+# Which assertion site does each action's failure feed into. Actions
+# with no entry (i.e. INSERT/DELETE) don't surface SinceViolation by
+# design, so any error from them goes to the generic transient/unknown
+# bucket without firing a property assertion.
+_ACTION_ASSERTION = {
+    "select_baseline": "peek-no-since-violation",
+    "prepared_count": "prepared-execute-no-since-violation",
+    "prepared_join": "prepared-execute-no-since-violation",
+    "explicit_txn": "peek-no-since-violation",
+}
+
+
+def _fire_property(
+    site: str, ok: bool, batch_id: str, action: str, err: str | None
+) -> None:
+    """Fire the `always` assertion for `site`; unknown sites are a no-op.
+
+    `ok=True` records a clean action so the site exists in Antithesis's
+    catalog even without violations; `ok=False` is the violation, with
+    `err` (first 1500 chars) attached. NOTE(review): `op` is per-site, so
+    `select_baseline` also gets "explicit-txn-on-hot-objects".
+    """
+    if site == "peek-no-since-violation":
+        always(
+            ok,
+            "peek: explicit-txn reads never surface a SinceViolation or "
+            "since-related read-hold error",
+            {
+                "batch_id": batch_id,
+                "action": action,
+                "op": "explicit-txn-on-hot-objects",
+                **({"error": err[:1500]} if err else {}),
+            },
+        )
+    elif site == "prepared-execute-no-since-violation":
+        always(
+            ok,
+            "peek: PREPARE+EXECUTE on indexed MV never surfaces a "
+            "SinceViolation or since-related read-hold error "
+            "(old peek sequencing)",
+            {
+                "batch_id": batch_id,
+                "action": action,
+                "op": "prepared-execute-on-hot-objects",
+                **({"error": err[:1500]} if err else {}),
+            },
+        )
+
+
+def main() -> int:
+    """Run the weighted action loop for RUNTIME_S on one connection; return 0."""
+    batch_id = f"h{helper_random.random_u64():016x}"
+    LOG.info("driver starting; batch_id=%s runtime=%.1fs", batch_id, RUNTIME_S)
+
+    weight_total = sum(w for _, w in _ACTIONS)
+    deadline = time.monotonic() + RUNTIME_S
+
+    counts: dict[str, int] = {name: 0 for name, _ in _ACTIONS}
+    transient_count = 0
+    unknown_count = 0
+    since_violation_count = 0
+    clean_count = 0
+
+    try:
+        with connect(autocommit=True) as conn, conn.cursor() as cur:
+            op_idx = 0
+            while time.monotonic() < deadline:
+                action = _pick_action(weight_total)
+                counts[action] += 1
+                fn = _ACTION_FNS[action]
+                try:
+                    fn(cur, batch_id, op_idx)
+                    clean_count += 1
+                    # Trivially-true `always` so the site is cataloged on clean runs.
+                    site = _ACTION_ASSERTION.get(action)
+                    if site is not None:
+                        _fire_property(site, True, batch_id, action, None)
+                except psycopg.Error as exc:
+                    err = str(exc)
+                    verdict = _classify(err)
+                    site = _ACTION_ASSERTION.get(action)
+                    if verdict == _VERDICT_SINCE_VIOLATION:
+                        since_violation_count += 1
+                        if site is not None:
+                            _fire_property(site, False, batch_id, action, err)
+                        LOG.warning(
+                            "batch=%s SINCE VIOLATION on %s: %s",
+                            batch_id,
+                            action,
+                            err[:200],
+                        )
+                    elif verdict == _VERDICT_TRANSIENT:
+                        transient_count += 1
+                        LOG.info(
+                            "batch=%s transient on %s: %s",
+                            batch_id,
+                            action,
+                            err[:200],
+                        )
+                    else:
+                        unknown_count += 1
+                        LOG.warning(
+                            "batch=%s unknown-error on %s: %s",
+                            batch_id,
+                            action,
+                            err[:300],
+                        )
+                    if conn.closed:
+                        # Dead connection: stop instead of spinning on errors.
+                        break
+                op_idx += 1
+    except Exception as exc:  # noqa: BLE001
+        # Helper-level fault (e.g. connection budget exhausted): counts as transient.
+        LOG.info("batch=%s connection setup/loop failed: %s", batch_id, exc)
+        transient_count += 1
+
+    # Liveness anchor: if no action ever completes cleanly across the
+    # run, the `always(True)` sites are vacuous and lose their meaning.
+    sometimes(
+        clean_count > 0,
+        "pw-hot: at least one action completed cleanly per invocation",
+        {
+            "batch_id": batch_id,
+            "clean_actions": clean_count,
+            "transient_errors": transient_count,
+            "unknown_errors": unknown_count,
+            "since_violations": since_violation_count,
+            "action_counts": counts,
+        },
+    )
+
+    LOG.info(
+        "batch=%s done: clean=%d transient=%d unknown=%d since_violation=%d actions=%s",
+        batch_id,
+        clean_count,
+        transient_count,
+        unknown_count,
+        since_violation_count,
+        counts,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())