From 4170e4368e1d639cdf257f81734eae16aa66caf6 Mon Sep 17 00:00:00 2001 From: Mitch Wagner Date: Wed, 6 May 2026 15:25:19 -0400 Subject: [PATCH 01/65] feat: working local antithesis build --- antithesis/AGENTS.md | 15 + antithesis/Makefile | 104 ++++++ antithesis/config/docker-compose.yaml | 320 ++++++++++++++++++ antithesis/scratchbook/bug-candidates.md | 161 +++++++++ antithesis/scratchbook/deployment-topology.md | 157 +++++++++ antithesis/scratchbook/existing-assertions.md | 37 ++ .../catalog-recovery-consistency.md | 33 ++ .../properties/command-channel-ordering.md | 28 ++ .../compute-replica-epoch-isolation.md | 25 ++ .../critical-reader-fence-linearization.md | 24 ++ .../properties/deployment-lag-detection.md | 26 ++ .../properties/deployment-promotion-safety.md | 26 ++ .../epoch-fencing-prevents-split-brain.md | 35 ++ .../properties/fault-recovery-exercised.md | 28 ++ .../properties/group-commit-toctou-safety.md | 28 ++ .../idempotent-write-under-indeterminate.md | 28 ++ .../properties/mv-reflects-source-updates.md | 32 ++ .../properties/peek-lifecycle-exactly-once.md | 35 ++ .../properties/persist-cas-monotonicity.md | 34 ++ .../properties/source-ingestion-progress.md | 27 ++ .../storage-command-replay-idempotent.md | 28 ++ .../properties/strict-serializable-reads.md | 34 ++ .../properties/tombstone-sealing-finality.md | 22 ++ antithesis/scratchbook/property-catalog.md | 217 ++++++++++++ .../scratchbook/property-relationships.md | 56 +++ antithesis/scratchbook/sut-analysis.md | 217 ++++++++++++ test/antithesis/export-compose.py | 58 ++++ test/antithesis/mzcompose.py | 88 +++++ test/antithesis/workload/Dockerfile | 34 ++ test/antithesis/workload/mzbuild.yml | 1 + test/antithesis/workload/setup-complete.sh | 22 ++ .../workload/test/anytime_health_check.sh | 19 ++ .../workload/workload-entrypoint.sh | 16 + 33 files changed, 2015 insertions(+) create mode 100644 antithesis/AGENTS.md create mode 100644 antithesis/Makefile create mode 100644 
antithesis/config/docker-compose.yaml create mode 100644 antithesis/scratchbook/bug-candidates.md create mode 100644 antithesis/scratchbook/deployment-topology.md create mode 100644 antithesis/scratchbook/existing-assertions.md create mode 100644 antithesis/scratchbook/properties/catalog-recovery-consistency.md create mode 100644 antithesis/scratchbook/properties/command-channel-ordering.md create mode 100644 antithesis/scratchbook/properties/compute-replica-epoch-isolation.md create mode 100644 antithesis/scratchbook/properties/critical-reader-fence-linearization.md create mode 100644 antithesis/scratchbook/properties/deployment-lag-detection.md create mode 100644 antithesis/scratchbook/properties/deployment-promotion-safety.md create mode 100644 antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md create mode 100644 antithesis/scratchbook/properties/fault-recovery-exercised.md create mode 100644 antithesis/scratchbook/properties/group-commit-toctou-safety.md create mode 100644 antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md create mode 100644 antithesis/scratchbook/properties/mv-reflects-source-updates.md create mode 100644 antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md create mode 100644 antithesis/scratchbook/properties/persist-cas-monotonicity.md create mode 100644 antithesis/scratchbook/properties/source-ingestion-progress.md create mode 100644 antithesis/scratchbook/properties/storage-command-replay-idempotent.md create mode 100644 antithesis/scratchbook/properties/strict-serializable-reads.md create mode 100644 antithesis/scratchbook/properties/tombstone-sealing-finality.md create mode 100644 antithesis/scratchbook/property-catalog.md create mode 100644 antithesis/scratchbook/property-relationships.md create mode 100644 antithesis/scratchbook/sut-analysis.md create mode 100644 test/antithesis/export-compose.py create mode 100644 test/antithesis/mzcompose.py create mode 100644 
test/antithesis/workload/Dockerfile create mode 100644 test/antithesis/workload/mzbuild.yml create mode 100755 test/antithesis/workload/setup-complete.sh create mode 100755 test/antithesis/workload/test/anytime_health_check.sh create mode 100755 test/antithesis/workload/workload-entrypoint.sh diff --git a/antithesis/AGENTS.md b/antithesis/AGENTS.md new file mode 100644 index 0000000000000..ff80e8994fb67 --- /dev/null +++ b/antithesis/AGENTS.md @@ -0,0 +1,15 @@ +This directory contains files relevant to running tests in Antithesis. + +Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands. + +**setup-complete.sh** +Inject this script into a Dockerfile to notify Antithesis that setup is complete. This script should only run once the system under test is ready for testing. Antithesis will not run any test commands until it receives this event. + +**config** +This directory contains the `docker-compose.yaml` file used to bring up this system within the Antithesis environment, along with any closely related config files. + +**scratchbook** +This directory is the Antithesis scratchbook for the codebase. It contains documents such as system analysis, property catalogs, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, and other persistent integration notes. Keep it up to date as Antithesis-related decisions change. + +**test** +This directory contains test templates. A test template is a directory containing test command executable files. Each test command must have a valid prefix: `parallel_driver_, singleton_driver_, serial_driver_, first_, eventually_, finally_, anytime_`. Prefixes constrain when and how commands are composed in a single timeline. 
Files or subdirectories prefixed with `helper_` are ignored by Test Composer and can be used for helper scripts kept alongside the commands. diff --git a/antithesis/Makefile b/antithesis/Makefile new file mode 100644 index 0000000000000..d29e795d22be7 --- /dev/null +++ b/antithesis/Makefile @@ -0,0 +1,104 @@ +# Build / run helper for the Materialize Antithesis harness. +# +# Usage: +# make build # build every local image +# make up # export compose, build, bring up the stack +# make test # smoke test against the running cluster +# make push # push locally-built images to Antithesis registry +# make down # tear down (preserves volumes) +# make clean # tear down + remove volumes + images +# make smoke # full cycle: build → up → test + +SHELL := /usr/bin/env bash +.SHELLFLAGS := -eu -o pipefail -c + +PROJECT := materialize +REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/..) + +ifndef RUNTIME + RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none)) +endif +ifeq ($(RUNTIME),none) + $(error neither podman nor docker found in PATH; set RUNTIME=docker or install podman) +endif + +COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f config/docker-compose.yaml +PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize + +REGISTRY ?= us-central1-docker.pkg.dev +REGISTRY_PATH ?= /molten-verve-216720/materialize-repository + +# --------------------------------------------------------------------------- +# Export — generate the resolved docker-compose YAML for Antithesis. 
+# --------------------------------------------------------------------------- +.PHONY: export-compose +export-compose: + cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml + @echo "Wrote config/docker-compose.yaml" + +# --------------------------------------------------------------------------- +# Build — build images that don't have public equivalents. +# --------------------------------------------------------------------------- +LOCAL_IMAGES := workload +BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%) + +.PHONY: build $(BUILD_TARGETS) +build: $(BUILD_TARGETS) + +$(BUILD_TARGETS): build-%: + $(RUNTIME) build \ + --platform linux/amd64 \ + -t $(PROJECT)-$*:latest \ + $(REPO_ROOT)/test/antithesis/$* + +# --------------------------------------------------------------------------- +# Up / Down +# --------------------------------------------------------------------------- +.PHONY: up +up: export-compose build + $(COMPOSE) up -d + +.PHONY: down +down: + $(COMPOSE) down + +# --------------------------------------------------------------------------- +# Test — quick smoke test against the running cluster +# --------------------------------------------------------------------------- +.PHONY: test +test: + $(PSQL) -c "CREATE TABLE IF NOT EXISTS smoke_test (k INT, v TEXT)" + $(PSQL) -c "INSERT INTO smoke_test VALUES (1, 'hello'), (2, 'world')" + $(PSQL) -c "SELECT * FROM smoke_test ORDER BY k" + $(PSQL) -c "DROP TABLE smoke_test" + +# --------------------------------------------------------------------------- +# Push — tag local images and push to the Antithesis registry +# --------------------------------------------------------------------------- +.PHONY: push +push: + @$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \ + | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \ + | while read item; do \ + nametag="$${item#localhost/}"; \ + name="$${nametag%:*}"; \ + 
remote="$(REGISTRY)$(REGISTRY_PATH)/$${name}:latest"; \ + echo "Pushing $${item} -> $${remote}"; \ + $(RUNTIME) tag "$${item}" "$${remote}" || exit 1; \ + $(RUNTIME) push "$${remote}" || exit 1; \ + done + +# --------------------------------------------------------------------------- +# Clean — tear down, drop volumes, remove locally built images (podman lists them with a localhost/ prefix) +# --------------------------------------------------------------------------- +.PHONY: clean +clean: down + $(COMPOSE) down -v --remove-orphans 2>/dev/null || true + -$(RUNTIME) rmi $$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' | grep '^\(localhost/\)\?$(PROJECT)-' || true) 2>/dev/null + +# --------------------------------------------------------------------------- +# Smoke — full cycle: build → up → test +# --------------------------------------------------------------------------- +.PHONY: smoke +smoke: up test + @echo "[smoke] passed" diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml new file mode 100644 index 0000000000000..6eb68d6f7e789 --- /dev/null +++ b/antithesis/config/docker-compose.yaml @@ -0,0 +1,320 @@ +services: + postgres-metadata: + command: + - postgres + - -c + - wal_level=logical + - -c + - max_wal_senders=100 + - -c + - max_replication_slots=100 + - -c + - max_connections=5000 + ports: + - '26257' + environment: + - POSTGRESDB=postgres + - POSTGRES_PASSWORD=postgres + - LD_PRELOAD=libeatmydata.so + - PGPORT=26257 + - POSTGRES_HOST_AUTH_METHOD=trust + healthcheck: + test: + - CMD + - pg_isready + - -U + - postgres + interval: 1s + start_period: 30s + restart: 'no' + volumes: + - ../../misc/postgres/setup_materialize.sql:/docker-entrypoint-initdb.d/z_setup_materialize.sql + platform: linux/amd64 + image: postgres:17.7 + minio: + entrypoint: + - sh + - -c + command: + - mkdir -p /data/persist && minio server /data --console-address :9001 + ports: + - 9000 + - 9001 + environment: + - MINIO_STORAGE_CLASS_STANDARD=EC:0 + - MINIO_HEAL_DISABLE=on + - MINIO_DISK_WATERMARK_LOW=1 + - MINIO_DISK_WATERMARK_HIGH=1 + healthcheck: + 
test: + - CMD + - curl + - --fail + - http://localhost:9000/minio/health/live + timeout: 5s + interval: 1s + start_period: 30s + platform: linux/amd64 + image: minio/minio:latest + redpanda: + image: redpandadata/redpanda:v25.2.11 + ports: + - 9092 + - 8081 + command: + - redpanda + - start + - --overprovisioned + - --smp=1 + - --memory=1G + - --reserve-memory=0M + - --node-id=0 + - --check=false + - --set + - redpanda.enable_transactions=true + - --set + - redpanda.enable_idempotence=true + - --set + - redpanda.auto_create_topics_enabled=True + - --set + - redpanda.topic_memory_per_partition=4096 + - --set + - --advertise-kafka-addr=kafka:9092 + networks: + default: + aliases: + - kafka + - schema-registry + healthcheck: + test: + - CMD + - curl + - -f + - localhost:9644/v1/status/ready + interval: 1s + start_period: 120s + platform: linux/amd64 + materialized: + hostname: materialized + depends_on: + minio: + condition: service_started + postgres-metadata: + condition: service_healthy + command: + - --unsafe-mode + - --environment-id=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - --persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/&region=minio + - --orchestrator-process-propagate-crashes + - --persist-consensus-url=postgres://root@postgres-metadata:26257?options=--search_path=consensus + - --orchestrator-process-tcp-proxy-listen-addr=0.0.0.0 + - --orchestrator-process-prometheus-service-discovery-directory=/mzdata/prometheus + ports: + - 6875 + - 6876 + - 6877 + - 6878 + - 6880 + - 6881 + - 26257 + environment: + - MZ_NO_TELEMETRY=1 + - MZ_NO_BUILTIN_CONSOLE=1 + - MZ_EAT_MY_DATA=1 + - MZ_TEST_ONLY_DUMMY_SEGMENT_CLIENT=true + - MZ_SOFT_ASSERTIONS=1 + - MZ_ORCHESTRATOR_PROCESS_TCP_PROXY_LISTEN_ADDR=0.0.0.0 + - MZ_ORCHESTRATOR_PROCESS_PROMETHEUS_SERVICE_DISCOVERY_DIRECTORY=/mzdata/prometheus + - MZ_BOOTSTRAP_ROLE=materialize + - MZ_INTERNAL_PERSIST_PUBSUB_LISTEN_ADDR=0.0.0.0:6879 + - 
MZ_PERSIST_PUBSUB_URL=http://127.0.0.1:6879 + - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection + - MZ_EXTERNAL_LOGIN_PASSWORD_MZ_SYSTEM=password + - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345 + - MZ_CATALOG_STORE=persist + - MZ_LOG_FILTER + - CLUSTERD_LOG_FILTER + - 'MZ_CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 2, "workers": 4}, "scale=1,workers=1,legacy": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + false, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=2,legacy": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": false, "memory_limit": "4 GiB", "scale": + 1, "workers": 2}, "free": {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": + "1", "disabled": true, "disk_limit": null, "is_cc": true, "memory_limit": "4 + GiB", "scale": 1, "workers": 1}, "scale=1,workers=1": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "1", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=1,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=16GiB": + 
{"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=1,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 1}, "scale=1,workers=1,mem=1GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "1 GiB", "scale": + 1, "workers": 1}, "scale=1,workers=2": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=2,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 2}, "scale=1,workers=2,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=2,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "2", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 2}, "scale=2,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 2, "workers": 1}, "scale=2,workers=2": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, 
"memory_limit": "4 GiB", "scale": 2, "workers": 2}, "scale=1,workers=2,mem=2GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "2", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "2 GiB", "scale": + 1, "workers": 2}, "scale=1,workers=4": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 4}, "scale=1,workers=4,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 4}, "scale=1,workers=4,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, "workers": 4}, "scale=1,workers=4,mem=32GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "4", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "32 GiB", "scale": 1, "workers": 4}, "scale=4,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "4", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 4, "workers": 1}, "scale=4,workers=4": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 4, "workers": 4}, "scale=1,workers=8": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 8}, "scale=1,workers=8,mem=4GiB": {"cpu_exclusive": false, "cpu_limit": + null, 
"credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=8GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale": + 1, "workers": 8}, "scale=1,workers=8,mem=16GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "16 GiB", "scale": 1, "workers": 8}, "scale=1,workers=8,mem=32GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "8", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale": + 1, "workers": 8}, "scale=8,workers=1": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "8", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 8, "workers": 1}, "scale=8,workers=8": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "64", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 8, "workers": 8}, "scale=1,workers=16": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=4GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 16}, "scale=1,workers=16,mem=8GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "16", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "8 GiB", "scale": 1, "workers": 16}, "scale=1,workers=16,mem=16GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "16 GiB", "scale": + 1, 
"workers": 16}, "scale=1,workers=16,mem=32GiB": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "16", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "32 GiB", "scale": 1, "workers": 16}, "scale=16,workers=1": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "16", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 16, "workers": 1}, "scale=16,workers=16": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "256", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 16, "workers": 16}, "scale=1,workers=32": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 1, "workers": 32}, "scale=1,workers=32,mem=4GiB": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=8GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "8 GiB", "scale": + 1, "workers": 32}, "scale=1,workers=32,mem=16GiB": {"cpu_exclusive": false, + "cpu_limit": null, "credits_per_hour": "32", "disabled": false, "disk_limit": + null, "is_cc": true, "memory_limit": "16 GiB", "scale": 1, "workers": 32}, "scale=1,workers=32,mem=32GiB": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "32", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "32 GiB", "scale": + 1, "workers": 32}, "scale=32,workers=1": {"cpu_exclusive": false, "cpu_limit": + null, "credits_per_hour": "32", "disabled": false, "disk_limit": null, "is_cc": + true, "memory_limit": "4 GiB", "scale": 32, "workers": 1}, "scale=32,workers=32": + {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": 
"1024", "disabled": + false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": + 32, "workers": 32}}' + - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SUPPORT_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_CATALOG_SERVER_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_ANALYTICS_CLUSTER_REPLICA_SIZE=bootstrap + - MZ_BOOTSTRAP_BUILTIN_SYSTEM_CLUSTER_REPLICATION_FACTOR=1 + - MZ_BOOTSTRAP_BUILTIN_PROBE_CLUSTER_REPLICATION_FACTOR=1 + - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1 + - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s + - COCKROACH_LOG_MAX_SYNC_DURATION=120s + - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_sourc
e=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=1677
7216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1 + - MZ_NO_EXTERNAL_CLUSTERD=1 + - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle + - MZ_NO_BUILTIN_POSTGRES=1 + - MZ_NO_BUILTIN_COCKROACH=1 + - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter + - MZ_LISTENERS_CONFIG_PATH=/listeners_config + volumes: + - ../../src/materialized/ci/listener_configs/testdrive.json:/listeners_config + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + tmpfs: + - /tmp + healthcheck: + test: + - CMD + - curl + - -f + - localhost:6878/api/readyz + interval: 1s + start_period: 600s + stop_grace_period: 120s + platform: linux/amd64 + image: materialize/materialized:latest + workload: + depends_on: + materialized: + condition: service_healthy + redpanda: + condition: service_healthy + environment: + - PGHOST=materialized + - PGPORT=6875 + - PGUSER=materialize + - KAFKA_BROKER=kafka:9092 + - SCHEMA_REGISTRY_URL=http://schema-registry:8081 + platform: linux/amd64 + image: materialize-workload:latest +networks: {} +volumes: + mzdata: null + pgdata: null + mysqldata: null + mssqldata: null + 
sourcedata_512Mb: + driver_opts: + device: tmpfs + type: tmpfs + o: size=512m + mydata: null + tmp: null + secrets: null + scratch: null diff --git a/antithesis/scratchbook/bug-candidates.md b/antithesis/scratchbook/bug-candidates.md new file mode 100644 index 0000000000000..f90d8b377a706 --- /dev/null +++ b/antithesis/scratchbook/bug-candidates.md @@ -0,0 +1,161 @@ +# Bug Candidates for Antithesis Reproduction + +Bugs found by mining the Materialize git history for timing/concurrency fixes +that Antithesis's deterministic scheduling would reliably find. + +## 1. Persist Lease Race (Best Candidate) + +**Commit**: `43f024da36` — "persist: Make sure to obtain a lease before selecting a batch" +**PR**: #35554 +**Severity**: Production incident — read-time halt +**Category**: TOCTOU race + +### The Bug + +Persist uses "seqno leases" to prevent GC from deleting batches a reader is +still processing. Before the fix, readers selected a batch *then* obtained a +lease. GC could delete the batch in between: + +``` +Reader GC +────── ── +1. snapshot() at SeqNo 5 + → picks BatchA (blob: part-0001) + 2. Compaction merges BatchA away → SeqNo 6 + 3. seqno_since advances (no lease on 5) + 4. Deletes part-0001 from blob storage +5. lease_seqno() → SeqNo 7 (too late) +6. fetch(BatchA) → 404 → HALT +``` + +The fix reorders to: lease first, then select batch. The lease prevents GC +from advancing past the leased SeqNo. + +### Code Paths Affected + +- `Listen::next` (read.rs:287) — continuous feed that hydrates MVs. Runs in + the background for every materialized view with an active source. This is the + most natural trigger — always active, exercises the lease path on every new + batch. +- `snapshot_cursor` (read.rs:1176) — used by "persist peeks" (SELECT on + unindexed tables). Less common than the listen path. +- `snapshot_and_fetch` (read.rs:889) — used by catalog ops and txn-WAL reads. + +All three now go through `snapshot_batches()` (read.rs:846), which does +lease-then-snapshot. 
+ +### Workload to Trigger + +Simple mixed read/write traffic exercises the listen path: +- Continuous INSERTs into a table (creates new batches → SeqNo churn → GC pressure) +- A materialized view over that table (its listen is always running) +- Concurrent SELECTs on the MV (served from in-memory arrangements, but + the listen feeding the MV is the actual race target) + +Compaction and GC run automatically in the background. Antithesis's scheduler +can interleave GC between batch selection and lease acquisition. + +### Properties + +- `persist-cas-monotonicity` — batch data should never disappear +- `critical-reader-fence-linearization` — leases should protect batches +- Workload-side: reads never hang or error unexpectedly +- SUT-side: the panic at read.rs:864 fires if a batch is missing after the + upper advanced (added by the fix — would need to be preserved in a + revert-and-detect test) + +### Testing Notes + +A pure `git revert` of `43f024da36` removes both the fix AND the panic that +detects the impossible state. To validate, surgically revert only the ordering +(put lease back after snapshot) while keeping the panic or replacing it with +`assert_unreachable!`. + +--- + +## 2. Compute Dependency Frontier Race + +**Commit**: `42a22b7ff5` — "compute: fix a race condition in collecting dependency frontiers" +**Severity**: Compute controller panic +**Category**: TOCTOU — check-then-act across async boundary + +### The Bug + +The compute controller checked whether storage collections existed, then +collected their frontiers in a second step. Collections could be dropped +between the two steps: + +``` +Step 1: check_exists(collection_id) → true + storage drops collection_id +Step 2: collections_frontiers([collection_id]) → panic! missing key +``` + +Fix: replaced the two-step check-then-read with a single +`collection_frontiers(id).ok()` that handles missing collections atomically. 
+ +**File**: `src/compute-client/src/controller/instance.rs` + +### Workload to Trigger + +Rapid concurrent DDL — CREATE/DROP of sources and MVs while the compute +controller is resolving dependency frontiers. + +### Properties + +- `compute-replica-epoch-isolation` +- System should never panic from DDL operations + +--- + +## 3. Reclock Upper Race with as_of + +**Commit**: `e3805ad790` — "Fetch latest upper in reclock to avoid races with as_of" +**Severity**: Panic (fixes database-issues#8698) +**Category**: Stale cached value in timing-sensitive decision + +A cached `upper` became stale between caching and `as_of` calculation, causing +panic when `as_of > upper`. + +### Properties + +- `strict-serializable-reads` + +--- + +## 4. MV-Sink Discarding Valid Batch Descriptions + +**Commit**: `0886c94dc2` — "mv-sink: stop discarding valid batch descriptions" +**Severity**: Silent data loss +**Category**: Stale frontier view + +Incorrect persist frontier view caused valid batch descriptions to be rejected +as "outdated." No crash, no error — just silently dropped data. + +### Properties + +- `mv-reflects-source-updates` + +--- + +## 5. Introspection Collection Frontier Regression + +**Commit**: `ec4f8996bb` — "compute: avoid frontier regressions for introspection collections" +**Severity**: Frontier monotonicity violation +**Category**: Initialization ordering mismatch + +### Properties + +- `persist-cas-monotonicity` + +--- + +## 6. 
as_of Selection Upper Constraint Bugs + +**Commit**: `e6ca4801fa` — "as_of_selection: fix two bugs around upper constraints" +**Severity**: 0dt upgrade availability blocked +**Category**: Incorrect boundary calculation + +### Properties + +- `deployment-promotion-safety` diff --git a/antithesis/scratchbook/deployment-topology.md b/antithesis/scratchbook/deployment-topology.md new file mode 100644 index 0000000000000..b03f0aa469449 --- /dev/null +++ b/antithesis/scratchbook/deployment-topology.md @@ -0,0 +1,157 @@ +# Deployment Topology: Materialize + +## Approach: mzcompose-Generated Docker Compose + +The most straightforward path is to use Materialize's **mzcompose** framework to generate the Docker Compose configuration for Antithesis. mzcompose already defines all the service classes, health checks, environment variables, and dependencies needed to run a complete Materialize test environment. + +**Strategy**: Write an `mzcompose.py` file that defines the Antithesis test topology, use mzcompose to generate the Docker Compose YAML, then adapt it for Antithesis (adding test template mounts). + +## Topology Overview + +``` ++---------------------+ +---------------------+ +| workload-client | ---> | materialized | +| (test driver, | <--- | (environmentd + | +| Antithesis SDK, | | embedded clusterd) | +| test templates) | | | ++---------------------+ +---------+-----------+ + | + +------------------+------------------+ + | | | + v v v + +----------------+ +----------------+ +----------------+ + | postgres- | | minio | | redpanda | + | metadata | | (blob storage) | | (Kafka-compat) | + | (consensus) | | | | | + +----------------+ +----------------+ +----------------+ +``` + +## Container Specifications + +### 1. postgres-metadata (Dependency) + +| | | +|---|---| +| **Role** | Metadata store / consensus for persist and catalog | +| **Image** | `postgres:16` (or mzcompose's `PostgresMetadata` service) | +| **Why** | Default metadata store in modern mzcompose. 
Lighter than CockroachDB. Sufficient for single-node testing. | +| **Ports** | 5432 | +| **Health check** | `pg_isready -U postgres` | +| **Network connections** | materialized reads/writes catalog and persist consensus | +| **Replicas** | 1 | + +PostgreSQL is the default metadata store in modern Materialize testing (`EXTERNAL_METADATA_STORE=postgres-metadata`). CockroachDB is an alternative but adds complexity and state space without benefit for single-coordinator testing. + +### 2. minio (Dependency) + +| | | +|---|---| +| **Role** | S3-compatible blob storage for persist data | +| **Image** | `minio/minio` (or mzcompose's `Minio` with `setup_materialize=True`) | +| **Why** | Persist stores all durable data (source data, MV data, catalog snapshots) in blob storage. MinIO is the standard test substitute for S3. | +| **Ports** | 9000 (S3 API), 9001 (console) | +| **Health check** | `curl --fail http://localhost:9000/minio/health/live` | +| **Network connections** | materialized writes/reads persist blobs | +| **Replicas** | 1 | +| **Config** | Pre-create `/data/persist` bucket. `MINIO_STORAGE_CLASS_STANDARD=EC:0` | + +### 3. redpanda (Dependency) + +| | | +|---|---| +| **Role** | Kafka-compatible message broker for stream source ingestion | +| **Image** | `redpandadata/redpanda` (or mzcompose's `Redpanda` service) | +| **Why** | Enables testing the Kafka source ingestion path, which is the most common production use case. Redpanda is lighter than Kafka+Zookeeper and includes a built-in Schema Registry. | +| **Ports** | 9092 (Kafka API), 8081 (Schema Registry) | +| **Health check** | `rpk cluster health` | +| **Network connections** | materialized reads source data; workload-client may produce test data | +| **Replicas** | 1 | + +### 4. materialized (Service — SUT) + +| | | +|---|---| +| **Role** | The system under test. Runs environmentd (coordinator) with embedded clusterd (compute/storage workers). 
| +| **Image** | `materialized` (mzcompose's `Materialized` service, built via `mzbuild`) | +| **Why** | This is the core SUT. The embedded clusterd mode runs everything in one process, simplifying the topology while still exercising all three layers (adapter, compute, storage). | +| **Ports** | 6875 (pgwire), 6876-6878 (API/admin), 6879 (persist pubsub), 26257 (pg-compat) | +| **Health check** | `curl -f localhost:6878/api/readyz` (interval 1s, start_period 600s) | +| **Network connections** | postgres-metadata (consensus), minio (blob), redpanda (sources) | +| **Replicas** | 1 | +| **Key environment** | `MZ_NO_TELEMETRY=1`, `MZ_SOFT_ASSERTIONS=1`, `MZ_CATALOG_STORE=persist`, `MZ_BOOTSTRAP_ROLE=materialize`, `MZ_UNSAFE_MODE=1` | +| **Key command args** | `--unsafe-mode`, `--persist-blob-url=s3://minioadmin:minioadmin@persist/persist?endpoint=http://minio:9000/&region=minio`, `--environment-id=...` | +| **Depends on** | postgres-metadata, minio | + +**Design decision**: Use embedded clusterd (single process) rather than separate clusterd containers. This reduces state space while still exercising all code paths. Separate clusterd testing can be added as a second topology later. + +### 5. workload-client (Client — Test Driver) + +| | | +|---|---| +| **Role** | Runs Antithesis test commands. Emits `setup_complete`. Contains test templates. | +| **Image** | Custom image built on top of testdrive or a Python-based client | +| **Why** | Exercises the system via SQL (pgwire), produces Kafka messages, and asserts properties via the Antithesis SDK. | +| **Ports** | None exposed | +| **Network connections** | materialized (pgwire:6875), redpanda (Kafka:9092, SR:8081) | +| **Replicas** | 1 | +| **Test template mount** | `/opt/antithesis/test/v1/materialize/` | + +The workload client needs: +1. PostgreSQL client library (psycopg2 or psql) to issue SQL +2. Kafka producer library to push test data +3. Antithesis Python SDK for assertions and lifecycle signals +4.
Test command scripts with appropriate prefixes (`first_`, `parallel_driver_`, `eventually_`, `finally_`, `anytime_`) + +## SDK Selection + +| Component | Language | SDK Needed | +|-----------|----------|------------| +| workload-client | Python | `antithesis-sdk` Python package — for assertions, lifecycle signals | +| materialized (optional, future) | Rust | `antithesis-sdk` Rust crate — for SUT-side reachability/safety assertions | + +The workload client **must** have the SDK for emitting assertions. SUT-side Rust SDK instrumentation is optional but recommended for deeper coverage of internal invariants (persist CaS correctness, frontier monotonicity, catalog consistency). + +## mzcompose Integration Path + +### Option A: Static Docker Compose (Recommended for v1) + +1. Write an `mzcompose.py` that defines the topology above +2. Run `mzcompose --find antithesis gen-docker-compose` (or equivalent) to emit YAML +3. Add any Antithesis-specific adaptations as needed +4. Place the resulting `docker-compose.yml` in `guest/opt/materialize/` + +### Option B: Dynamic mzcompose (Future) + +1. Package the entire mzcompose framework into the workload-client image +2. Use a `first_` test command to generate and start the compose topology +3. More flexible but more complex; requires mzcompose to work inside Antithesis + +Option A is the pragmatic choice. It generates a compose file that Antithesis can directly manage. + +## Workload Design (High Level) + +Test commands in `/opt/antithesis/test/v1/materialize/`: + +| Command | Type | Purpose | +|---------|------|---------| +| `first_setup.sh` | first_ | Create sources, materialized views, tables. Establish baseline state. | +| `parallel_driver_sql_workload.py` | parallel_driver_ | Continuously run SQL operations: INSERTs, SELECTs, CREATE/DROP views. Assert consistency properties. | +| `parallel_driver_kafka_producer.py` | parallel_driver_ | Produce messages to Kafka topics. Verify they appear in materialized views.
| +| `eventually_consistency_check.py` | eventually_ | Verify that all acknowledged writes are visible in materialized views. | +| `finally_invariant_check.py` | finally_ | Final consistency sweep: compare source data with MV contents. | +| `anytime_health_check.sh` | anytime_ | Verify system health endpoint and basic SQL connectivity. | + +## Assumptions + +- Embedded clusterd (single process) is sufficient for initial testing +- PostgreSQL is the preferred metadata store (simpler than CockroachDB) +- Redpanda is preferred over Kafka+Zookeeper (lighter, built-in schema registry) +- The workload client will be Python-based (leveraging existing testdrive patterns) +- Static Docker Compose generation (Option A) is the right starting point + +## Open Questions + +- Should we also test with external clusterd processes (separate compute replicas)? +- Should materialized be subject to fault injection, or only the network between it and dependencies? +- What is the best base image for the workload client — extend the existing testdrive image or build from scratch? +- Should the workload client use testdrive's `.td` format or raw SQL via psycopg? 
diff --git a/antithesis/scratchbook/existing-assertions.md b/antithesis/scratchbook/existing-assertions.md new file mode 100644 index 0000000000000..8e423c26a0415 --- /dev/null +++ b/antithesis/scratchbook/existing-assertions.md @@ -0,0 +1,37 @@ +# Existing Antithesis SDK Assertions + +## Summary + +**No Antithesis SDK assertions exist in the Materialize source code.** + +A comprehensive search of the Rust codebase at `materialize/src/` found: + +- No `use antithesis` import statements +- No Cargo.toml dependencies on any antithesis crate +- No assertion macros: `assert_always!`, `assert_sometimes!`, `assert_reachable!`, `assert_unreachable!` +- No antithesis function calls in the Python test code within the materialize repository + +## Existing Antithesis Integration (Customer Level) + +Antithesis integration exists at the **customer-repo level** (outside the materialize source), using the legacy experiment-script approach: + +### Experiment Scripts (`guest/opt/antithesis/experiment/`) + +- **`materialize.py`**: Docker Compose-based experiment. Uses `antithesis.start_customer_containers()`, `antithesis.start_fault_injector()`, `antithesis.run_process()`, `antithesis.fuzz_msg()`, `antithesis.end_test()`. Orchestrates testdrive workloads with network chaos (latency, packet loss, partitions). +- **`testdrive.py`**: K8s-based variant. Sets up k3s cluster with minio, redpanda, postgres, environmentd. Runs testdrive via kubectl. +- **`materialize-k8s.sh`**: Bash setup for K8s resources. + +### Docker Compose Topology (`guest/opt/materialize/docker-compose.yml`) + +Uses custom Antithesis-instrumented images: +- `antithesis-cp-combined` (Kafka + Schema Registry) +- `antithesis-materialized` (Materialize) +- `antithesis-testdrive` (Test workload) + +### K8s Manifests (`guest/opt/materialize/k8s/antithesis/`) + +Full Kubernetes topology: environmentd StatefulSet, postgres StatefulSet, redpanda Deployment, testdrive Pod, with PVs and services. 
+ +## Implications for New Work + +All property assertions will need to be added fresh. The existing integration provides a starting point for topology but uses an older approach (experiment scripts, custom instrumented images). The new approach should leverage mzcompose for compose generation and add Antithesis SDK assertions either in the workload client or (for deeper coverage) in the Materialize Rust source. diff --git a/antithesis/scratchbook/properties/catalog-recovery-consistency.md b/antithesis/scratchbook/properties/catalog-recovery-consistency.md new file mode 100644 index 0000000000000..8b581a99adf60 --- /dev/null +++ b/antithesis/scratchbook/properties/catalog-recovery-consistency.md @@ -0,0 +1,33 @@ +# catalog-recovery-consistency + +## Summary +After coordinator crash and restart, the catalog state is consistent: upper never decreases, snapshot is consolidated, all committed transactions visible. + +## Evidence + +### Code Paths +- `src/catalog/src/durable/persist.rs:536-539` — `sync_to_current_upper` +- `src/catalog/src/durable/persist.rs:575-577` — ListenEvent::Progress antichain logic +- `src/catalog/src/durable/persist.rs:706-724` — `consolidate` method +- `src/catalog/src/durable/persist.rs:593-612` — sync applies updates by timestamp, consolidates after each +- `src/catalog/src/durable/persist.rs:1092` — Assertion on snapshot consolidation +- `src/catalog/src/durable/persist.rs:1167-1170` — Fence token generation syncs to upper + +### How It Works +On startup, the coordinator reads the persist shard from the latest rollup + incremental diffs, reconstructing the full catalog state. `sync_to_current_upper()` applies all updates up to the current upper antichain and consolidates the snapshot. The existing code has a debug assertion at line 1092 checking consolidation. 
+ +### What Goes Wrong on Violation +- Upper regression: coordinator sees older schema state than what was committed, losing recent DDL +- Unconsolidated snapshot: duplicate entries cause incorrect catalog lookups, potential panics +- Missing transactions: committed DDL not visible after restart, users lose tables/views + +### Key Subtlety +Crash during `maybe_consolidate()` (lines 596, 610) could leave the snapshot in an intermediate state. On restart, the next sync must handle this gracefully by reconsolidating from the durable upper. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions (debug_assert exists at line 1092 but only in debug builds) +- Candidate: After `sync_to_current_upper()`, add `assert_always!` that upper >= previous upper +- Candidate: After consolidation, add `assert_always!` that no duplicate (kind, key) entries exist + +### Provenance +Surfaced by Failure Recovery focus (merged from catalog-upper-monotonicity and catalog-snapshot-consolidation). diff --git a/antithesis/scratchbook/properties/command-channel-ordering.md b/antithesis/scratchbook/properties/command-channel-ordering.md new file mode 100644 index 0000000000000..0f47965189999 --- /dev/null +++ b/antithesis/scratchbook/properties/command-channel-ordering.md @@ -0,0 +1,28 @@ +# command-channel-ordering + +## Summary +Timely workers must see CreateDataflow commands in identical order — code explicitly acknowledges this is not guaranteed by Timely. + +## Evidence + +### Code Paths +- `src/compute/src/command_channel.rs:88-90` — Comment: "relies on Timely channels preserving order of inputs, which is not something they guarantee" +- `src/compute/src/command_channel.rs:96-100` — Source operator activation sequence +- `src/compute/src/command_channel.rs:41-58` — Sender using `Arc` activator + +### How It Works +The command channel broadcasts commands from worker 0 to all other Timely workers via a Timely dataflow operator. 
Commands are fed in order, but the code explicitly notes that Timely does not guarantee preservation of input ordering. + +### What Goes Wrong on Violation +Workers execute dataflows in different orders, causing divergent state. Since all workers must agree on dataflow state for correct results, reordering leads to inconsistent query results or panics during distributed computation. + +### Why This Is an Antithesis Target +This is the kind of bug that almost never manifests in normal testing because thread scheduling is usually consistent. Antithesis's deterministic scheduling exploration can systematically vary worker activation timing to expose reordering. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: On each worker, log the command sequence and add `assert_always!` that worker N's command sequence matches worker 0's +- This is a strong candidate for SUT-side instrumentation since the invariant is internal to the compute engine + +### Provenance +Surfaced by Concurrency focus. diff --git a/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md b/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md new file mode 100644 index 0000000000000..019445cc28632 --- /dev/null +++ b/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md @@ -0,0 +1,25 @@ +# compute-replica-epoch-isolation + +## Summary +Compute replica incarnations are isolated by epoch — commands from old epochs cannot execute after a new epoch starts. + +## Evidence + +### Code Paths +- `src/compute-client/src/controller/replica.rs:70-107` — Epoch at line 93, ReplicaTask at line 146 +- `src/compute-client/src/protocol/command.rs:45-54` — Hello command with nonce for protocol iteration +- `src/compute-client/src/controller/replica.rs:142-144` — Task abortion on rehydration clears old commands + +### How It Works +Each replica incarnation gets a unique epoch (nonce + u64). 
On rehydration, the controller aborts the old ReplicaTask and creates a new one with an incremented epoch. The Hello command includes the new nonce, and the replica rejects commands with mismatched nonces. + +### What Goes Wrong on Violation +Stale commands from a previous incarnation execute on the new replica, causing it to diverge from the coordinator's expected state. Query results become inconsistent across replicas. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: On command receipt, add `assert_always!(command.epoch >= current_epoch)` in the replica's command handler +- Candidate: After rehydration, add `assert_reachable!` that the new epoch is used for the first command + +### Provenance +Surfaced by Distributed Coordination focus. diff --git a/antithesis/scratchbook/properties/critical-reader-fence-linearization.md b/antithesis/scratchbook/properties/critical-reader-fence-linearization.md new file mode 100644 index 0000000000000..5da820a7d464c --- /dev/null +++ b/antithesis/scratchbook/properties/critical-reader-fence-linearization.md @@ -0,0 +1,24 @@ +# critical-reader-fence-linearization + +## Summary +Critical reader opaque token comparison linearizes correctly — concurrent readers cannot bypass the fencing mechanism. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state.rs:1937-1979` — `compare_and_downgrade_since()` with opaque fencing +- `src/persist-client/src/critical.rs` — `CriticalReaderId` and `Opaque` definitions + +### How It Works +Critical readers hold a `since` frontier that prevents GC of data at held timestamps. The `compare_and_downgrade_since` operation uses an opaque token to fence: the caller provides `expected_opaque`, and if it doesn't match the current opaque in state, the operation fails (but still commits a SeqNo increment to prevent ABA). Only the caller with the correct opaque can advance the since. 
+ +### What Goes Wrong on Violation +If fencing is bypassed, two readers could both think they hold the since, leading to premature GC. Data needed by active readers is deleted, causing read failures or panics. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After successful downgrade, add `assert_always!(state.opaque == my_opaque)` to confirm fencing +- Candidate: On mismatch, add `assert_always!(seqno_advanced)` to confirm ABA prevention + +### Provenance +Surfaced by Data Integrity focus. diff --git a/antithesis/scratchbook/properties/deployment-lag-detection.md b/antithesis/scratchbook/properties/deployment-lag-detection.md new file mode 100644 index 0000000000000..213c3dd2f904b --- /dev/null +++ b/antithesis/scratchbook/properties/deployment-lag-detection.md @@ -0,0 +1,26 @@ +# deployment-lag-detection + +## Summary +0DT caught-up check eventually detects lagging or crash-looping replicas and blocks promotion. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/caught_up.rs:53-150` — `maybe_check_caught_up` with replica frontier snapshot +- `src/adapter/src/coord/caught_up.rs:127-136` — Lag comparison against allowed threshold +- `src/adapter/src/coord/caught_up.rs:145-149` — `problematic_replicas` detection +- Dynamic configs: `WITH_0DT_CAUGHT_UP_CHECK_ALLOWED_LAG`, `ENABLE_0DT_CAUGHT_UP_REPLICA_STATUS_CHECK` + +### How It Works +Periodically during catchup, the coordinator queries `MZ_CLUSTER_REPLICA_FRONTIERS` and compares each replica's frontier against the expected threshold. If any replica's frontier lags beyond `allowed_lag`, promotion is blocked. Additionally, `analyze_replica_looping()` checks `mz_cluster_replica_status_history` for crash patterns. + +### What Goes Wrong on Violation +If a stuck/crashing replica is not detected, promotion proceeds with an unhealthy replica. Post-promotion, queries routed to that replica fail or return stale results. 
+ +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: Add `assert_sometimes!(lagging_replica_blocked_promotion)` to confirm the detection path is exercised +- This is a liveness property — we want to confirm the system can detect the problem, not just that it doesn't happen + +### Provenance +Surfaced by Lifecycle focus. diff --git a/antithesis/scratchbook/properties/deployment-promotion-safety.md b/antithesis/scratchbook/properties/deployment-promotion-safety.md new file mode 100644 index 0000000000000..e6794631a0aec --- /dev/null +++ b/antithesis/scratchbook/properties/deployment-promotion-safety.md @@ -0,0 +1,26 @@ +# deployment-promotion-safety + +## Summary +0DT deployment promotion happens only after all replicas have caught up to required frontiers. + +## Evidence + +### Code Paths +- `src/environmentd/src/deployment/state.rs:92-108` — `set_ready_to_promote` transitions Initializing->CatchingUp->ReadyToPromote +- `src/environmentd/src/deployment/preflight.rs:57-120` — `preflight_0dt` with `caught_up_max_wait` and `caught_up_trigger` +- `src/adapter/src/coord/caught_up.rs:53-150` — Replica frontier checks via `MZ_CLUSTER_REPLICA_FRONTIERS` +- `src/catalog/src/durable/error.rs:115-124` — `FenceError::DeployGeneration` + +### How It Works +During 0DT deployment, the new coordinator boots in read-only mode. It runs preflight checks including `maybe_check_caught_up()` which compares replica frontiers against a cutoff threshold. Only after all replicas pass the check does the coordinator transition to ReadyToPromote. On promotion, the deployment generation is incremented, fencing out the old coordinator. + +### What Goes Wrong on Violation +Premature promotion causes the new coordinator to serve queries while replicas are still rehydrating from storage. Users see stale data or timeouts. In the worst case, the old coordinator continues writing with a lower generation, causing split-brain. 
+ +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: At promotion time, add `assert_always!` that all tracked replica frontiers >= cutoff +- Candidate: Add `assert_reachable!("0dt_promotion_completed")` to confirm the promotion path is exercised + +### Provenance +Surfaced by Lifecycle and Distributed Coordination focuses. diff --git a/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md b/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md new file mode 100644 index 0000000000000..3fb5167f9edf7 --- /dev/null +++ b/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md @@ -0,0 +1,35 @@ +# epoch-fencing-prevents-split-brain + +## Summary +Epoch-based leader fencing prevents two coordinators from concurrently writing to the catalog persist shard. + +## Evidence + +### Code Paths +- `src/catalog/src/durable/persist.rs:149-169` — `FenceableToken::validate()` and `maybe_fence()` check epoch on every write +- `src/catalog/src/durable/persist.rs:393-461` — `compare_and_append` with fence validation before consensus write +- `src/catalog/src/durable/error.rs:114-131` — `FenceError` enum: `DeployGeneration` and `Epoch` variants +- `src/catalog/src/durable/persist.rs:1166-1192` — Fence token generation during `open_inner` +- `src/environmentd/src/deployment/state.rs:24-123` — Deployment state machine transitions + +### How It Works +On startup, the coordinator reads the current fence token from consensus and increments the epoch. The new token is written via CaS. All subsequent writes include the token; if consensus contains a higher epoch, the write fails with `FenceError::Epoch`. + +### What Goes Wrong on Violation +Two coordinators with the same epoch could both write catalog mutations, leading to divergent schema state. Users would see inconsistent table definitions, lost DDL operations, or catalog corruption requiring manual intervention. + +### Failure Scenario +1. 
Coordinator A is running with epoch 10 +2. Coordinator A becomes partitioned from consensus +3. Coordinator B starts, reads epoch 10, increments to epoch 11 +4. Partition heals; A attempts to write with epoch 10 +5. **Expected**: A's write fails with FenceError +6. **Bug**: If A's CaS succeeds despite lower epoch (race in validation) + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions in codebase +- Candidate instrumentation point: `FenceableToken::validate()` — add `assert_always!` that validates token comparison result matches expected fencing behavior +- Candidate: `compare_and_append` success path — add `assert_always!` that write_epoch >= current_epoch (a write carrying an epoch lower than the one currently in consensus must never succeed) + +### Provenance +Surfaced independently by Distributed Coordination and Failure Recovery focuses. diff --git a/antithesis/scratchbook/properties/fault-recovery-exercised.md b/antithesis/scratchbook/properties/fault-recovery-exercised.md new file mode 100644 index 0000000000000..d6499991da5a6 --- /dev/null +++ b/antithesis/scratchbook/properties/fault-recovery-exercised.md @@ -0,0 +1,28 @@ +# fault-recovery-exercised + +## Summary +After coordinator crash, the system eventually recovers and serves queries. + +## Evidence + +### Code Paths +- `src/environmentd/src/environmentd/main.rs` — Main startup, catalog recovery +- `src/environmentd/src/http/probe.rs` — `/health/ready` endpoint +- `src/catalog/src/durable/persist.rs:1166-1192` — `open_inner` recovery path + +### How It Works +On restart, environmentd re-reads the catalog from persist, increments the epoch, rehydrates compute/storage clusters, and starts accepting connections. The readiness probe (`/health/ready`) returns 200 only after the adapter is fully initialized. + +### What Goes Wrong on Violation +The system fails to recover: it crashes on startup due to corrupt catalog state, enters an infinite restart loop, or becomes ready but cannot serve queries due to incomplete rehydration.
+ +### Why This Is a Property +This is the most fundamental liveness property. It doesn't test a specific invariant — it tests that the entire recovery pipeline works end-to-end under adversarial crash timing. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Best verified at workload level: crash environmentd, wait for readiness, issue SELECT query, assert success +- Candidate: Add `assert_sometimes!(recovery_completed_successfully)` after catalog recovery succeeds + +### Provenance +Surfaced by Failure Recovery focus. diff --git a/antithesis/scratchbook/properties/group-commit-toctou-safety.md b/antithesis/scratchbook/properties/group-commit-toctou-safety.md new file mode 100644 index 0000000000000..bae54fcc085cc --- /dev/null +++ b/antithesis/scratchbook/properties/group-commit-toctou-safety.md @@ -0,0 +1,28 @@ +# group-commit-toctou-safety + +## Summary +No phantom writes to tables deleted between write deferral and group_commit execution. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/appends.rs:479-486` — Explicit TOCTOU check: "If the table... has been deleted while the write was deferred" +- `src/adapter/src/coord/appends.rs:214-216` — `defer_op` enqueue point +- `src/adapter/src/coord/appends.rs:394-399` — JIT lock acquisition in group_commit + +### How It Works +When a write arrives and cannot immediately acquire the write lock, it is deferred. Later, group_commit processes deferred writes. Before applying each write, it checks `catalog().try_get_entry(table_id)`. If the table was dropped between deferral and execution, the write is silently dropped. + +### What Goes Wrong on Violation +Writes land in a shard for a table that no longer exists in the catalog. This causes inconsistency between the catalog (table doesn't exist) and persist (shard has data). Downstream queries may panic or return garbage. + +### The TOCTOU Window +The explicit comment at appends.rs:479 acknowledges the race. 
The window is between line 214 (write enqueued) and line 484 (catalog check during group_commit). Concurrent DDL (DROP TABLE) within this window is the trigger. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After group_commit drops a deferred write, add `assert_reachable!("group_commit_dropped_deferred_write_to_deleted_table")` to confirm this path is exercised +- Candidate: After group_commit succeeds, add `assert_always!` that all written table_ids still exist in catalog + +### Provenance +Surfaced by Concurrency focus. diff --git a/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md b/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md new file mode 100644 index 0000000000000..0837770823d9a --- /dev/null +++ b/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md @@ -0,0 +1,28 @@ +# idempotent-write-under-indeterminate + +## Summary +Compare-and-append retries with the same idempotency token produce exactly one committed write — never duplicates, never loss. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/machine.rs:387-468` — Detailed comments on Indeterminate error handling and retry-with-idempotency-token +- `src/persist-client/src/internal/state.rs:1687` — `compare_and_append` function +- `src/persist-client/src/write.rs:409` — Retry wrapper with `IdempotencyToken` +- `src/persist-client/src/internal/state.rs:1715-1724` — Writer state and lease tracking + +### How It Works +Each writer holds an `IdempotencyToken`. On Indeterminate error, the retry includes the same token. The state machine checks if a write with that token already succeeded (checking writer state). If so, it returns `AlreadyCommitted`. If not, it proceeds normally. + +### What Goes Wrong on Violation +Duplicate writes: the shard contains two copies of the same batch, leading to double-counting in materialized views. 
Or lost writes: the batch is neither committed nor retried successfully, causing data loss. + +### Key Subtlety +The comments at machine.rs:387-468 describe subtle scenarios where the writer must distinguish between "my write succeeded but I didn't get the ack" vs "my write failed and I need to retry." The IdempotencyToken is the mechanism, but the window between consensus write and state observation is where bugs hide. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After Indeterminate retry, add `assert_always!` that shard trace contains exactly one instance of the batch + +### Provenance +Surfaced by Data Integrity focus. diff --git a/antithesis/scratchbook/properties/mv-reflects-source-updates.md b/antithesis/scratchbook/properties/mv-reflects-source-updates.md new file mode 100644 index 0000000000000..a500f32fb1b0a --- /dev/null +++ b/antithesis/scratchbook/properties/mv-reflects-source-updates.md @@ -0,0 +1,32 @@ +# mv-reflects-source-updates + +## Summary +Materialized views eventually reflect changes to their source data. + +## Evidence + +### Code Paths +- `src/compute/src/render/` — Dataflow rendering for materialized views +- `src/compute/src/server.rs` — Compute server receives commands and renders dataflows +- `src/adapter/src/coord/sequencer/` — CREATE MATERIALIZED VIEW sequencing + +### How It Works +When source data changes, differential dataflow operators in the compute layer process the deltas and update the materialized view's persist shard. The MV's frontier advances as updates are committed. + +### What Goes Wrong on Violation +MVs show stale data permanently despite source updates. Users query a materialized view expecting fresh data and get results that never update. This is the core value proposition failure. + +### Why This Is an End-to-End Property +Unlike internal properties (epoch fencing, CaS monotonicity), this property is directly observable by users. 
It combines source ingestion, compute processing, and persist writes into a single check. + +### Workload Verification +1. INSERT INTO table1 VALUES (1, 'test') +2. Wait for mv1 (a materialized view that SELECTs from table1) to reflect the insert +3. SELECT * FROM mv1 — must eventually contain (1, 'test') + +### SUT-Side Instrumentation Notes +- Best verified at workload level via SQL assertions +- Candidate: Add `assert_sometimes!(mv_frontier_advanced)` in the compute persist sink + +### Provenance +Surfaced by Product Context focus. diff --git a/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md b/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md new file mode 100644 index 0000000000000..e0a3c6c682336 --- /dev/null +++ b/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md @@ -0,0 +1,35 @@ +# peek-lifecycle-exactly-once + +## Summary +Each peek command produces exactly one response — no duplicates, no leaks, no orphaned state. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/peek.rs:80-95` — Explicit "1:1 contract between Peek and PeekResponseUnary" comment +- `src/adapter/src/coord/peek.rs:873-920` — Response routing with UUID tracking +- `src/adapter/src/coord/peek.rs:1174-1209` — `cancel_pending_peeks`: removes from client_pending_peeks then pending_peeks +- `src/adapter/src/coord/peek.rs:1256-1268` — `remove_pending_peek`: consistency check between two maps +- `src/adapter/src/coord/peek.rs:1221-1227` — `handle_peek_notification` removes before response + +### How It Works +Peeks are tracked in two maps: `pending_peeks` (UUID -> PendingPeek) and `client_pending_peeks` (ConnectionId -> set of peek UUIDs). On response or cancellation, the peek is removed from both maps. Each UUID is unique (generated per-peek). 
+ +### What Goes Wrong on Violation +- Leaked peeks: UUID stays in pending_peeks forever, growing memory until OOM +- Duplicate responses: client receives two result sets for one query +- Missing responses: client hangs waiting for a peek that was silently dropped + +### The Race Condition +The two-map removal (client_pending_peeks + pending_peeks) at lines 1256-1268 is not atomic. If CancelPendingPeeks races with PeekNotification: +1. Cancel removes UUID from client_pending_peeks +2. Peek response arrives, finds UUID in pending_peeks but not in client_pending_peeks +3. Orphaned state or double-processing + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: At coordinator shutdown or periodically, add `assert_always!(pending_peeks.is_empty() || active_connections_exist)` to detect leaks +- Candidate: On peek response, add `assert_always!` that UUID existed in pending_peeks before removal + +### Provenance +Surfaced by Protocol Contracts and Concurrency focuses. diff --git a/antithesis/scratchbook/properties/persist-cas-monotonicity.md b/antithesis/scratchbook/properties/persist-cas-monotonicity.md new file mode 100644 index 0000000000000..46ab8e6dd7bfe --- /dev/null +++ b/antithesis/scratchbook/properties/persist-cas-monotonicity.md @@ -0,0 +1,34 @@ +# persist-cas-monotonicity + +## Summary +Persist shard state versions (SeqNo) must never decrease across any observation point. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state_versions.rs:48-87` — State version invariants: `earliest <= current.seqno` +- `src/persist-client/src/internal/state.rs:84-95` — `ROLLUP_THRESHOLD` and seqno-based rollup logic +- `src/persist-client/src/internal/state.rs:1324` — Invariant comment on rollup seqno +- `src/persist-client/src/internal/gc.rs` — GC respects seqno ordering +- `src/persist-client/src/write.rs:70-123` — WriteHandle CaS loop context + +### How It Works +Every state mutation increments SeqNo. 
The CaS loop in Machine reads current state, computes new state with SeqNo+1, and atomically writes via consensus. If another writer interleaved, the CaS fails and the writer retries with the newer SeqNo. Rollups periodically snapshot state; rollup seqno must be <= current seqno. + +### What Goes Wrong on Violation +SeqNo regression means state reconstruction from rollup + diffs produces wrong state. GC could delete diffs that are still needed. Writers could overwrite each other's changes. This is a data corruption scenario. + +### Failure Scenario +1. Writer A reads state at SeqNo 100, begins computing new state +2. Writer B reads state at SeqNo 100, writes SeqNo 101 +3. Writer A attempts to write SeqNo 101 — CaS should fail (current is now 101) +4. **Expected**: A retries, reads SeqNo 101, writes SeqNo 102 +5. **Bug**: If the CaS comparison is stale, A's write at SeqNo 101 succeeds despite B's already-committed SeqNo 101, overwriting B's change + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: `Machine::apply_unbatched_cmd` — add `assert_always!(new_seqno > old_seqno)` after every state transition +- Candidate: State reconstruction from rollup + diffs — add `assert_always!` that reconstructed state matches expected + +### Provenance +Surfaced by Data Integrity and Distributed Coordination focuses. diff --git a/antithesis/scratchbook/properties/source-ingestion-progress.md b/antithesis/scratchbook/properties/source-ingestion-progress.md new file mode 100644 index 0000000000000..aa3b83c54f9cd --- /dev/null +++ b/antithesis/scratchbook/properties/source-ingestion-progress.md @@ -0,0 +1,27 @@ +# source-ingestion-progress + +## Summary +Kafka source ingestion eventually makes progress — the source frontier advances. 
+ +## Evidence + +### Code Paths +- `src/storage/src/render/sources.rs` — Source operator assembly (Kafka, Postgres, MySQL connectors) +- `src/storage/src/source/reclock.rs` — Timestamp reclocking from source timestamps to Materialize timeline +- `src/storage/src/render/persist_sink.rs` — Writes ingested data to persist shards + +### How It Works +Storage workers connect to external sources (Kafka brokers, Postgres replication slots), read data, reclock timestamps, and write to persist. The source's upper frontier advances as data is ingested and persisted. + +### What Goes Wrong on Violation +Source stalls: materialized views stop updating, users see stale data indefinitely. This is the most visible user-facing failure mode for a streaming database. + +### Why This Is a Liveness Property +We want to confirm the system reaches a state where source data is flowing. Under fault injection (network partitions to Kafka, storage worker crashes), the source should eventually resume and make progress. + +### SUT-Side Instrumentation Notes +- Best verified at workload level: produce N messages to Kafka, query the source table, assert row count eventually reaches N +- Candidate: Add `assert_sometimes!(source_frontier_advanced)` in the persist sink write path + +### Provenance +Surfaced by Product Context focus. diff --git a/antithesis/scratchbook/properties/storage-command-replay-idempotent.md b/antithesis/scratchbook/properties/storage-command-replay-idempotent.md new file mode 100644 index 0000000000000..8046c29c5612e --- /dev/null +++ b/antithesis/scratchbook/properties/storage-command-replay-idempotent.md @@ -0,0 +1,28 @@ +# storage-command-replay-idempotent + +## Summary +Replaying storage command history after reconnection is idempotent — no duplicate ingestion or state divergence. 
+ +## Evidence + +### Code Paths +- `src/storage-controller/src/history.rs:20-80` — CommandHistory reduces and replays +- `src/storage-controller/src/instance.rs:46-80` — Replica rehydration via command history +- `src/storage-controller/src/persist_handles.rs:98-120` — Append retry semantics with Timestamp tracking + +### How It Works +The storage controller maintains a command history for each replica. On reconnection, it replays the reduced history. The history is compacted to remove superseded commands (e.g., only the latest configuration for each source). Sources resume from persisted offsets in persist, not from the beginning. + +### What Goes Wrong on Violation +Duplicate data appears in sources. Since materialized views are computed incrementally from sources, duplicates propagate to all downstream views. Users see incorrect aggregation results (double-counted rows). + +### Key Subtlety +Command history compaction assumes idempotency, but no explicit duplicate detection is observed in the code. If a RunIngestionCommand is partially executed (source starts but crashes before position is persisted), replay could re-ingest data from the last persisted offset, which may differ from the actual last-processed offset. + +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After replay, add `assert_always!` that source read position >= position before crash +- Candidate: After ingestion resumes, add `assert_always!` comparing row counts with expected deduplication + +### Provenance +Surfaced by Failure Recovery focus. 
diff --git a/antithesis/scratchbook/properties/strict-serializable-reads.md b/antithesis/scratchbook/properties/strict-serializable-reads.md new file mode 100644 index 0000000000000..450d623b4c6f3 --- /dev/null +++ b/antithesis/scratchbook/properties/strict-serializable-reads.md @@ -0,0 +1,34 @@ +# strict-serializable-reads + +## Summary +Reads respect the timestamp oracle's linearization point — later reads see all changes visible to earlier reads. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/timestamp_selection.rs:40-52` — When `chosen_ts` differs from `oracle_ts`, peek results must be delayed until oracle catches up +- `src/adapter/src/coord/sequencer/inner.rs:2097-2116` — Strict serializable reads tracked via `strict_serializable_reads_tx` +- `src/adapter/src/coord/timestamp_selection.rs:228-240` — `needs_linearized_read_ts` check +- `src/adapter/src/coord/in_memory_oracle.rs:92-101` — Oracle timestamp advancement + +### How It Works +The coordinator assigns every read a timestamp from the oracle. The oracle maintains a monotonically advancing timestamp. Strict serializable reads wait for the oracle to confirm their timestamp is linearized before returning results. This ensures no read can see a state "in the past" relative to another concurrent read. + +### What Goes Wrong on Violation +Users observe non-repeatable reads: query A at time T sees data that query B at time T+1 does not see. This violates the strict serializability contract that is Materialize's primary differentiator from other streaming systems. + +### Workload-Level Verification +This property is best verified at the workload level: +1. Client A writes row R and receives acknowledgment +2. Client B reads and must see R (or a later state including R) +3. Client C reads and must see at least what B saw + +The workload checks SQL results, not internal state. 
+ +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: `timestamp_selection.rs` oracle advancement — add `assert_always!` that oracle timestamp never decreases +- Candidate: After peek response, add workload-side `Always` assertion comparing read timestamp ordering with data ordering + +### Provenance +Surfaced by Protocol Contracts focus (merged from timestamp-oracle-linearization and strict-serializable-ordering). diff --git a/antithesis/scratchbook/properties/tombstone-sealing-finality.md b/antithesis/scratchbook/properties/tombstone-sealing-finality.md new file mode 100644 index 0000000000000..bc97da01197ae --- /dev/null +++ b/antithesis/scratchbook/properties/tombstone-sealing-finality.md @@ -0,0 +1,22 @@ +# tombstone-sealing-finality + +## Summary +Once a shard is tombstoned (upper and since are both the empty antichain, with no registered writers or critical readers), no further mutations are possible. + +## Evidence + +### Code Paths +- `src/persist-client/src/internal/state.rs:2128-2134` — `is_tombstone()` checks upper.is_empty() && since.is_empty() && writers.is_empty() && critical_readers.is_empty() +- `src/persist-client/src/internal/state.rs:1703-1712` — compare_and_append short-circuits on tombstone +- `src/persist-client/src/internal/state.rs:2146-2159` — `become_tombstone_and_shrink()` transition + +### What Goes Wrong on Violation +If a tombstoned shard accepts new writes, deleted tables/views could have data resurrected. This would confuse users and violate the contract that DROP TABLE removes data permanently. 
+ +### SUT-Side Instrumentation Notes +- No existing Antithesis assertions +- Candidate: After `is_tombstone()` returns true, add `assert_always!` that subsequent append attempts return error +- Candidate: `become_tombstone_and_shrink()` — add `assert_unreachable!` after the transition if any subsequent mutation succeeds + +### Provenance +Surfaced by Data Integrity focus. 
diff --git a/antithesis/scratchbook/property-catalog.md b/antithesis/scratchbook/property-catalog.md new file mode 100644 index 0000000000000..ffbba999a7031 --- /dev/null +++ b/antithesis/scratchbook/property-catalog.md @@ -0,0 +1,217 @@ +--- +commit: ca6deb6758e651876582ae7d4dec24ce32d87567 +updated: 2026-05-06 +--- + +# Property Catalog: Materialize + +## Category 1: Data Integrity Under Faults + +Properties that verify data correctness when crashes, network partitions, and concurrent access interact with the persist layer and catalog. + +### epoch-fencing-prevents-split-brain — Epoch-Based Fencing Prevents Split-Brain Writes + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — fundamental split-brain prevention; failure here corrupts all state | +| **Property** | After a coordinator restart with a higher epoch, the old coordinator (lower epoch) cannot successfully write to the catalog persist shard. | +| **Invariant** | `Always`: once a higher epoch is written to consensus, any compare_and_append from a lower epoch must fail with FenceError. This is a strict safety invariant — every check must hold. | +| **Antithesis Angle** | Network partition separates old coordinator from consensus while new coordinator starts with higher epoch. When partition heals, old coordinator's in-flight writes must be rejected. Antithesis explores the timing window between old coordinator's last successful write and new coordinator's first write. | +| **Why It Matters** | Split-brain writes corrupt the catalog, potentially causing data loss or inconsistent schema state. This is the fundamental distributed safety mechanism. Surfaced by: Distributed Coordination, Failure Recovery. 
| + +### persist-cas-monotonicity — Persist SeqNo Never Decreases + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — backbone of persist consistency; all other persist properties depend on this | +| **Property** | Persist shard state versions (SeqNo) form a strictly increasing sequence. No writer can observe or apply a lower SeqNo after observing a higher one. | +| **Invariant** | `Always`: for any shard, if SeqNo N is observed, no subsequent observation returns SeqNo < N. Rollups maintain seqno <= seqno_since. This must hold on every check — a single violation means state corruption. | +| **Antithesis Angle** | Partition storage from persist backend mid-write. One writer races to increment SeqNo while another caches an old value and retries. Crash during GC/rollup operations. Antithesis explores interleaving of concurrent CaS loops. | +| **Why It Matters** | SeqNo monotonicity is the backbone of persist's consistency model. Violations cause state reconstruction failures and data loss. Surfaced by: Data Integrity, Distributed Coordination. | + +### tombstone-sealing-finality — Tombstoned Shards Are Immutable + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — prevents zombie writes to dropped collections | +| **Property** | Once a shard's upper and since both advance to the empty antichain (tombstone), no new writes, reader registrations, or writer registrations can succeed. The transition is irreversible. | +| **Invariant** | `Always`: after `is_tombstone()` returns true, any append, downgrade_since, or registration attempt must fail. The state machine must never revert from tombstone. | +| **Antithesis Angle** | Crash and restart after tombstone. Fire concurrent write/read attempts while state is being replayed from consensus. Antithesis explores whether recovery code can accidentally un-tombstone a shard. | +| **Why It Matters** | Tombstone finality prevents zombie writes to dropped collections. 
Violation could resurface deleted data. Surfaced by: Data Integrity. | + +### idempotent-write-under-indeterminate — Compare-and-Append Idempotency on Retry + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — indeterminate errors are the hardest distributed systems edge case | +| **Property** | When compare_and_append receives an Indeterminate error from consensus and retries with the same idempotency token, the shard contains exactly one copy of the write — never zero, never two. | +| **Invariant** | `Always`: after retry with identical IdempotencyToken, the shard's upper reflects exactly one successful write. Duplicate data must never appear in the shard trace. | +| **Antithesis Angle** | Inject network failures on consensus calls mid-flight. Kill writer after batch is queued but before state is committed. Antithesis explores the window between consensus write and acknowledgment. | +| **Why It Matters** | Indeterminate errors are the hardest to handle correctly in distributed systems. Duplication or loss here silently corrupts downstream materialized views. Surfaced by: Data Integrity. | + +## Category 2: Consistency Model Enforcement + +Properties that verify Materialize's strict serializability guarantee and timestamp oracle correctness. + +### strict-serializable-reads — Reads Respect Timestamp Oracle Linearization + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — Materialize's core advertised guarantee; user-visible | +| **Property** | Two reads on the same collection at timestamps t1 < t2 (assigned by the oracle) must observe consistent ordering: if t1 sees state S, t2 cannot observe a state prior to S. | +| **Invariant** | `Always`: for any two reads where oracle assigns t1 < t2, the result at t2 must include all changes visible at t1. The oracle read timestamp must advance monotonically. | +| **Antithesis Angle** | Run parallel transactions in StrictSerializable mode. One writes, another reads concurrently. 
Inject delays in oracle timestamp advancement. Antithesis explores whether reads can bypass the linearization point. | +| **Why It Matters** | Strict serializability is Materialize's core advertised guarantee. Users explicitly choose it over eventual consistency. Violation is a correctness bug visible to end users. Surfaced by: Protocol Contracts. | + +### catalog-recovery-consistency — Catalog State Consistent After Crash Recovery + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — catalog corruption on recovery prevents system from starting | +| **Property** | After coordinator crash and restart, the recovered catalog state is equivalent to the pre-crash state: upper never decreases, snapshot is consolidated, and all committed transactions are visible. | +| **Invariant** | `Always`: upper(post_restart) >= upper(pre_crash). After sync_to_current_upper(), the snapshot contains no unconsolidated entries (all diffs resolved). | +| **Antithesis Angle** | Crash coordinator during catalog_transact (after some updates persist but before upper advances). Crash during consolidation. Antithesis explores the timing of crashes within the catalog write path. | +| **Why It Matters** | Catalog inconsistency after recovery can cause schema corruption, lost DDL, or inability to restart. Surfaced by: Failure Recovery. | + +## Category 3: Compute and Storage Recovery + +Properties that verify correct behavior during and after process crashes in the compute and storage layers. + +### compute-replica-epoch-isolation — Stale Replica Commands Rejected After Rehydration + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — stale commands cause compute divergence and wrong query results | +| **Property** | Each compute replica incarnation has a unique epoch (nonce + u64). After rehydration with epoch N+1, no commands from epoch N can execute or affect dataflow state. 
| +| **Invariant** | `Always`: once a command with epoch N+1 is processed, all epoch N commands are dropped. The epoch forms a strict ordering on replica incarnations. | +| **Antithesis Angle** | Kill compute replica mid-dataflow. Controller rehydrates with new epoch. In-flight commands from the old epoch leak back due to network buffering. Antithesis explores whether stale commands can sneak past the epoch check. | +| **Why It Matters** | Stale command execution causes compute replicas to diverge from the coordinator's expected state, potentially returning wrong query results. Surfaced by: Distributed Coordination. | + +### storage-command-replay-idempotent — Storage Command History Replay Is Idempotent + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — non-idempotent replay causes data duplication in all downstream MVs | +| **Property** | When a storage replica reconnects, the controller replays command history from the last frontier. Replaying the same commands twice yields identical state — no duplicated ingestion or state divergence. | +| **Invariant** | `Always`: apply(history[0:i]) + apply(history[0:i]) == apply(history[0:i]). Source ingestion positions must resume from persisted offsets, not restart from zero. | +| **Antithesis Angle** | Crash storage controller mid-send of RunIngestionCommand. Restart and replay history. Antithesis explores whether partial command delivery causes duplicate ingestion. | +| **Why It Matters** | Non-idempotent replay causes duplicate data in sources, which propagates to all downstream materialized views. Surfaced by: Failure Recovery. | + +## Category 4: Concurrency and Race Conditions + +Properties that verify correctness under concurrent access patterns within the coordinator. 
+ +### group-commit-toctou-safety — No Phantom Writes to Deleted Tables + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — phantom writes corrupt catalog; TOCTOU explicitly acknowledged in code | +| **Property** | If a table is deleted between when a write is deferred and when group_commit executes, the write is silently dropped — not persisted. No phantom writes to non-existent tables. | +| **Invariant** | `Always`: if catalog.try_get_entry(table_id) returns None at group_commit time, the write's updates are not included in the committed batch. | +| **Antithesis Angle** | Concurrent table deletion + write operations. Antithesis delays between deferred write queuing and group_commit catalog check, exposing the TOCTOU window where the table ceases to exist between validation and execution. | +| **Why It Matters** | Phantom writes to deleted tables corrupt the catalog or cause panics during downstream processing. The explicit TOCTOU check in appends.rs:479-486 acknowledges this risk. Surfaced by: Concurrency. | + +### peek-lifecycle-exactly-once — Each Peek Gets Exactly One Response + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — leaked peeks cause OOM; explicit 1:1 contract documented | +| **Property** | For each peek command sent to compute, exactly one PeekResponse is delivered to the client — no duplicates, no missing responses, no orphaned pending_peeks entries. | +| **Invariant** | `Always`: count(peek_commands) == count(peek_responses) with bijective UUID mapping. When CancelPendingPeeks races with PeekNotification, exactly one of (canceled, completed) occurs — never both, never neither. | +| **Antithesis Angle** | Trigger replica failures mid-peek. Race cancel requests with response delivery. Antithesis explores the two-map removal sequence (client_pending_peeks + pending_peeks) that is not atomic. | +| **Why It Matters** | Leaked peeks cause memory growth and eventually OOM. Duplicate responses confuse clients. 
The 1:1 contract is explicitly documented in peek.rs:80-95. Surfaced by: Protocol Contracts, Concurrency. | + +### command-channel-ordering — Timely Workers See Commands in Identical Order + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — code explicitly acknowledges ordering is unguaranteed; hard to trigger | +| **Property** | CreateDataflow commands broadcast through the command channel execute in identical order across all Timely workers — no reordering. | +| **Invariant** | `Always`: for any two workers W1 and W2, if W1 sees command A before B, W2 also sees A before B. Code comment at command_channel.rs:88-90 explicitly notes this relies on "Timely channels preserving order of inputs, which is not something they guarantee." | +| **Antithesis Angle** | Inject timing delays in the source operator between command channel invocations. Stress the sync_activator bridge between sync and async contexts. Antithesis explores whether worker scheduling variations cause reordering. | +| **Why It Matters** | Command reordering causes workers to diverge, producing inconsistent dataflow results. The code explicitly acknowledges this is unguaranteed. Surfaced by: Concurrency. | + +## Category 5: Lifecycle Transitions + +Properties about 0DT deployment, startup, and shutdown correctness. + +### deployment-promotion-safety — 0DT Promotion Only After Full Catchup + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — relevant for cloud deployments; requires multi-coordinator setup | +| **Property** | During 0DT deployment, the new coordinator transitions to ReadyToPromote only after catalog is loaded, caught-up checks pass, and all replica frontiers have advanced past the required threshold. Promotion with stale replicas is prevented. | +| **Invariant** | `Always`: at the moment set_ready_to_promote() is called, all collections tracked by caught_up checks have frontiers >= the cutoff threshold. 
The deployment generation fence prevents the old coordinator from writing after promotion. | +| **Antithesis Angle** | Trigger preflight concurrently with replica startup. Crash replicas during catchup. Antithesis explores whether the caught_up check can pass while a replica is still lagging or crash-looping. | +| **Why It Matters** | Premature promotion causes the new coordinator to serve stale data or fail to serve at all. This is the primary risk in zero-downtime deployments. Surfaced by: Lifecycle, Distributed Coordination. | + +### deployment-lag-detection — Caught-Up Check Detects Stuck Replicas + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — companion to deployment-promotion-safety; requires 0DT setup | +| **Property** | During 0DT catchup, maybe_check_caught_up() eventually detects replicas that are lagging beyond configured thresholds or crash-looping, and prevents promotion until resolved. | +| **Invariant** | `Sometimes(lagging_replica_detected)`: Antithesis should observe at least one scenario where a lagging/crashing replica is detected and promotion is blocked. This is a liveness property — the detection must eventually happen. | +| **Antithesis Angle** | Inject replica crashes during catchup phase. Verify the analyze_replica_looping() function identifies the problem via mz_cluster_replica_status_history. | +| **Why It Matters** | Undetected stuck replicas during 0DT deployment lead to silent data staleness in production. Surfaced by: Lifecycle. | + +## Category 6: Reachability and Coverage + +Properties that verify the system reaches interesting states under fault injection. 
+ +### fault-recovery-exercised — System Recovers from Coordinator Crash + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P0 — most fundamental operational property; prerequisite for all others | +| **Property** | After the coordinator (environmentd) crashes and restarts, the system eventually becomes healthy (readiness endpoint returns 200) and can serve SQL queries. | +| **Invariant** | `Sometimes(healthy_after_crash)`: the system must reach a state where it can serve queries after a crash. This confirms recovery works end-to-end, not just in unit tests. | +| **Antithesis Angle** | Kill environmentd at various points during operation. Verify it restarts, reconnects to persist, recovers catalog, and serves queries. Antithesis explores crash timing — during DDL, during peek, during group_commit. | +| **Why It Matters** | Recovery is the most critical operational property. If it doesn't work, nothing else matters. Surfaced by: Failure Recovery. | + +### source-ingestion-progress — Kafka Source Eventually Catches Up + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — important but requires Kafka/Redpanda in topology | +| **Property** | After creating a Kafka source, Materialize eventually ingests all available data and the source's write frontier advances past the data's timestamps. | +| **Invariant** | `Sometimes(source_frontier_advances)`: the source's upper antichain must advance at least once during the test run, confirming data is flowing through the ingestion pipeline. | +| **Antithesis Angle** | Create a Kafka source, produce messages, then inject network faults between Materialize and Redpanda. Verify the source eventually catches up when connectivity is restored. | +| **Why It Matters** | Source ingestion is the primary data path. If it stalls, all downstream materialized views stop updating. Surfaced by: Product Context. 
| + +### mv-reflects-source-updates — Materialized Views Eventually Reflect Source Changes + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — end-to-end user-visible correctness; Materialize's core value | +| **Property** | After data is written to a source, materialized views that depend on that source eventually reflect the new data. | +| **Invariant** | `Sometimes(mv_contains_new_data)`: after inserting data into a table or producing to a Kafka source, a SELECT on a dependent materialized view must eventually return the new data. | +| **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. | +| **Why It Matters** | This is the end-to-end user-visible correctness property. Materialize's value proposition is that MVs are always up-to-date. Surfaced by: Product Context. | + +### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — incorrect fencing allows premature GC causing data loss | +| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. | +| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. | +| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. | +| **Why It Matters** | Critical readers control garbage collection boundaries. 
Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. | diff --git a/antithesis/scratchbook/property-relationships.md b/antithesis/scratchbook/property-relationships.md new file mode 100644 index 0000000000000..4df508acd6f03 --- /dev/null +++ b/antithesis/scratchbook/property-relationships.md @@ -0,0 +1,56 @@ +# Property Relationships + +## Cluster 1: Persist Layer Safety + +**Properties**: `persist-cas-monotonicity`, `tombstone-sealing-finality`, `idempotent-write-under-indeterminate`, `critical-reader-fence-linearization` + +These properties share the persist state machine code in `src/persist-client/src/internal/`. They all exercise the compare-and-swap loop in `Machine` and the `State` transitions. A bug in the CaS loop or state validation could violate multiple properties simultaneously. + +**Suspected dominance**: `persist-cas-monotonicity` is foundational — if SeqNo monotonicity breaks, all other persist properties likely break too. It dominates `tombstone-sealing-finality` and `critical-reader-fence-linearization`. + +## Cluster 2: Fencing and Split-Brain Prevention + +**Properties**: `epoch-fencing-prevents-split-brain`, `compute-replica-epoch-isolation`, `deployment-promotion-safety` + +These properties all use epoch-based fencing to prevent stale actors from mutating state. They share the pattern of "increment epoch on new incarnation, reject operations from old epoch." The catalog fencing (`epoch-fencing-prevents-split-brain`) and deployment fencing (`deployment-promotion-safety`) share code paths in `src/catalog/src/durable/persist.rs`. + +**Suspected dominance**: `epoch-fencing-prevents-split-brain` is the most fundamental — it protects the catalog. `deployment-promotion-safety` builds on it by also requiring caught-up checks before promotion. `compute-replica-epoch-isolation` is independent (different epoch mechanism for compute). 
+ +## Cluster 3: Crash Recovery Pipeline + +**Properties**: `catalog-recovery-consistency`, `storage-command-replay-idempotent`, `fault-recovery-exercised` + +These properties test the recovery path after process crashes. `fault-recovery-exercised` is the end-to-end liveness check; `catalog-recovery-consistency` and `storage-command-replay-idempotent` test specific subsystems within recovery. + +**Suspected dominance**: `fault-recovery-exercised` is the weakest check (just "system comes back"). `catalog-recovery-consistency` is strictly stronger (catalog state is correct after recovery). If catalog recovery fails, the end-to-end recovery also fails. + +## Cluster 4: Consistency Model + +**Properties**: `strict-serializable-reads`, `mv-reflects-source-updates`, `source-ingestion-progress` + +These properties form a chain: source ingestion feeds materialized views, which serve reads. `strict-serializable-reads` depends on correct timestamp oracle behavior and frontier management. If `source-ingestion-progress` fails (data doesn't flow), `mv-reflects-source-updates` also fails, but `strict-serializable-reads` could still pass on stale but consistent data. + +**Suspected dominance**: `strict-serializable-reads` is independent of the liveness properties. `mv-reflects-source-updates` implies `source-ingestion-progress` (if MVs update, sources must have made progress). + +## Cluster 5: Coordinator Concurrency + +**Properties**: `group-commit-toctou-safety`, `peek-lifecycle-exactly-once`, `command-channel-ordering` + +These properties target different concurrency mechanisms within the coordinator and compute engine. They share the coordinator's event loop as the execution context but test independent subsystems (write path, read path, command dispatch). + +**No dominance**: These properties are independent of each other. A bug in peek handling doesn't imply a bug in group_commit or command channels. 
+ +## Cluster 6: Deployment Lifecycle + +**Properties**: `deployment-promotion-safety`, `deployment-lag-detection` + +Both test the 0DT deployment pipeline. `deployment-lag-detection` is a prerequisite for `deployment-promotion-safety` — if lag detection fails, promotion may proceed unsafely. + +**Suspected dominance**: `deployment-promotion-safety` is stronger — it requires both lag detection and correct fencing. `deployment-lag-detection` is a liveness check on a subsystem of the promotion pipeline. + +## Cross-Cluster Connections + +- `epoch-fencing-prevents-split-brain` (Cluster 2) protects `catalog-recovery-consistency` (Cluster 3) — fencing ensures only one writer during recovery +- `persist-cas-monotonicity` (Cluster 1) underpins `catalog-recovery-consistency` (Cluster 3) — catalog is stored in persist, so CaS correctness is a prerequisite +- `strict-serializable-reads` (Cluster 4) depends on `epoch-fencing-prevents-split-brain` (Cluster 2) — split-brain would allow inconsistent timestamp assignments +- `idempotent-write-under-indeterminate` (Cluster 1) protects `storage-command-replay-idempotent` (Cluster 3) — storage ingestion uses persist writes, so idempotency matters for both diff --git a/antithesis/scratchbook/sut-analysis.md b/antithesis/scratchbook/sut-analysis.md new file mode 100644 index 0000000000000..a0ff7561eed5e --- /dev/null +++ b/antithesis/scratchbook/sut-analysis.md @@ -0,0 +1,217 @@ +# SUT Analysis: Materialize + +## System Overview + +Materialize is a real-time data integration platform and streaming SQL database written primarily in Rust. It reads change data from PostgreSQL (logical replication), MySQL, Kafka/Redpanda, and webhooks, then maintains materialized views incrementally using differential dataflow. It speaks the PostgreSQL wire protocol, so any psql client or Postgres driver can connect. 
+ +The system claims **strict serializability** for interactive queries and provides **incremental, consistent, low-latency** results over streaming data. It does not offer approximate answers or eventual consistency. + +## Architecture + +### Three-Layer Design + +Materialize is organized into three logical layers that run as separate processes: + +**1. Adapter Layer (environmentd)** +- Main coordinator process (`src/environmentd/`) +- Hosts pgwire server (port 6875), HTTP API (6878), and internal coordination endpoints +- Parses SQL, plans queries, manages sessions, enforces consistency +- Contains the Catalog (schema metadata) in memory, persisted to durable storage +- Runs a **single-threaded async event loop** on a Tokio runtime for coordination +- Multiplexes ComputeController and StorageController to manage downstream clusters + +**2. Compute Layer (clusterd - compute)** +- Worker processes running Timely Dataflow engines (`src/compute*/`, `src/clusterd/`) +- Executes views, maintains materialized views, performs joins +- Stateless — can be rehydrated from storage on crash +- Multiple replicas provide active replication for HA +- Workers parallelize via native OS threads (one per Timely worker) + +**3. 
Storage Layer (clusterd - storage)** +- Worker processes for data ingestion (`src/storage*/`) +- Reads from external sources (Kafka, Postgres CDC, MySQL, webhooks) +- Reclocks source timestamps to Materialize's internal timeline +- Writes to Persist (blob storage + consensus) for durability +- Manages sinks (Kafka sinks with exactly-once semantics) + +### Communication Protocols + +| Path | Protocol | Details | +|------|----------|---------| +| Client -> Balancerd -> Environmentd | pgwire (PostgreSQL wire protocol) | TLS, port 6875 | +| Environmentd -> Clusterd | CTP (Cluster Transport Protocol) | Length-prefixed bincode over TCP/UDS, ports 2100-2101 | +| Clusterd workers <-> workers | Timely mesh | Generation-epoch protocol, ports 2102-2103 | +| Clusterd -> Persist | HTTP/S3 API | Blob storage writes + consensus CaS | +| Environmentd -> Persist | Direct | Catalog stored in persist shard | +| Clusterd -> Environmentd | Persist PubSub | HTTP on port 6879, state change subscriptions | + +### Key Entrypoints + +- `src/environmentd/src/environmentd/main.rs` — main server startup +- `src/clusterd/src/bin/clusterd.rs` — compute/storage worker startup +- `src/balancerd/` — stateless connection router +- `src/pgwire/` — PostgreSQL wire protocol implementation +- `src/adapter/` — SQL planning, coordination, session management + +## State Management + +### Five Tiers of State + +1. **Catalog metadata** — table/view/source/sink definitions, roles, clusters + - Stored in a persist shard (blob + consensus) + - Reconstructed into `CatalogState` in-memory on startup + - Mutated via `catalog_transact()` with atomic `TransactionBatch` writes + +2. **Source/ingestion data** — rows from Kafka, Postgres CDC, MySQL, webhooks + - Written to persist shards by storage workers + - Keyed by Materialize-assigned timestamps (reclocked from source timestamps) + +3. 
**Materialized view data** — output of incrementally-maintained computations + - Written to persist shards by compute workers + - Stored as columnar batches in blob storage + +4. **Timestamps/frontiers** — read/write boundaries tracking collection completeness + - `since` (read frontier): minimum time a collection can be read + - `upper` (write frontier): exclusive upper bound of the times written so far (data is complete for times in `[since, upper)`) + - Tracked as `Antichain` lattice values + - Global timestamp oracle provides causally-consistent read times + +5. **In-flight state** — active dataflow computations, pending peeks, session state + - Held in memory by compute/storage workers and the coordinator + - Lost on crash, recovered via replay from persist + +### Persistence Architecture + +**Blob Storage (S3/MinIO/Azure/Postgres-backed):** +- Immutable data batches (columnar Parquet/Arrow format) +- Rollups (periodic snapshots of shard state for fast recovery) + +**Consensus (CockroachDB/PostgreSQL/FoundationDB):** +- Shard metadata: `since`, `upper`, spine structure +- Writer/reader leases with heartbeats +- Sequence numbers (`SeqNo`) for version linearity +- Catalog mutations as `StateUpdate` events + +**Atomic Writes:** +- Compare-and-append via `Machine`: writers must match expected `upper` antichain +- Idempotency tokens prevent duplicates on retries +- Fencing via `FenceToken` (deploy generation + epoch) prevents split-brain + +## Concurrency Model + +### Coordinator (environmentd) +- **Single-threaded event loop** on Tokio runtime +- Processes commands via `tokio::select!` from multiple MPSC channels +- Per-object write locks (`Arc<Mutex<()>>`) serialize DDL to same object +- Catalog shared as `Arc` for read-only off-thread access; mutations are serialized through the event loop +- Timeline state (`global_timelines`) accessed serially within event loop + +### Compute/Storage Workers (clusterd) +- One native OS thread per Timely worker (configurable count) +- Workers coordinate via Timely's internal barriers and distributed snapshot
semantics +- Commands received via MPSC channels from controllers +- Worker 0 broadcasts commands to other workers per Timely conventions + +### Synchronization Primitives +- `Arc` for per-object write locks +- `mpsc::UnboundedSender/Receiver` for coordinator internal messaging +- `watch::Sender/Receiver` for per-connection cancellation +- `Arc` (std) for low-contention shared state (metrics, log writers) +- Timely's own worker-to-worker channels for dataflow coordination + +## Safety and Liveness Guarantees + +### Claimed Safety Guarantees + +1. **Strict Serializability** (design doc 20220516): "Transactions in Materialize are strictly serializable with respect to operations inside of Materialize" (SELECT, INSERT, UPDATE, DELETE). All timestamp transitions made durable before response issued. + +2. **Definiteness** (design doc 20210831): Collections are "definite" — all uses yield exactly the same time-varying data at each logical time. Data definite for times in range `[since, upper)`. + +3. **Exactly-Once Kafka Sinks** (design doc 20200520): Transactional consistency for Kafka sink output with consistency topic. + +4. **Acknowledged Writes Survive Failures**: All data written to persist (blob + consensus) before acknowledgment. Catalog mutations durable before response. + +5. **Epoch-Based Leader Fencing**: New coordinators increment epoch on startup; old coordinators' transactions fail. Prevents split-brain after coordinator crash. + +### Claimed Liveness Guarantees + +1. **Persist Reader/Writer Liveness**: "At least one reader/writer can always make progress" even when peers are paused or restarted. + +2. **Collection Progress**: "The collection upper advances so long as one writer can make progress." + +3. **Active Replication Recovery**: "Masking of recovery delay can only be guaranteed when compute controller can reach at least one non-faulty replica." + +4. **Automatic Failover**: Compute replicas automatically rehydrate from storage on crash. 
Multiple replicas mask recovery latency. + +### Limitations +- HA (multi-active replication) is cloud-only; self-managed has single coordinator +- SUBSCRIBE, sinks, and `AS OF` queries may circumvent strict serializability +- No byzantine fault tolerance; system assumes honest coordinator +- Single coordinator bottleneck for timestamp oracle + +## Failure and Degradation Modes + +### Failure-Prone Areas + +1. **Startup/Configuration**: Many `expect()`/`unwrap()` calls in startup path — misconfiguration causes immediate crash rather than degraded operation. + +2. **Replica Reconnection**: Infinite retry with exponential backoff (capped at 1s). Can cause minutes-long recovery latency during transient failures. No circuit breakers. + +3. **Persist Layer Failures**: No circuit breaker for blob/consensus unavailability. System retries with backoff, creating backpressure rather than failing fast. Bounded retry loops (3-5 attempts) for some storage management operations. + +4. **0DT Deployment**: Preflight checks with configurable timeout. Can either panic or proceed degraded if standby doesn't catch up. Read-only promotion before full read-write. 
+ +### Health Checking +- `/health/liveness` — always returns 200 (process is alive) +- `/health/ready` — returns 503 until adapter client available; optional `wait=true` blocks +- `curl localhost:6878/api/readyz` used in Docker healthchecks + +### Graceful Degradation +- Compute replicas: partial replica failure tolerated; system serves from remaining replicas +- 0DT standby boots read-only, promotes after catching up +- Feature flags return 503 rather than crashing when disabled +- No graceful degradation for metadata store (CRDB/PG) unavailability — system halts + +## External Dependencies + +| Dependency | Role | Criticality | +|-----------|------|-------------| +| CockroachDB / PostgreSQL / FoundationDB | Consensus for persist + catalog | CRITICAL — system halts without it | +| S3 / MinIO / Azure Blob | Blob storage for persist data | CRITICAL — writes fail without it | +| Kafka / Redpanda | Stream source ingestion | CRITICAL for streaming workflows | +| PostgreSQL (source) | CDC replication source | CRITICAL for CDC workflows | +| MySQL (source) | CDC replication source | Optional | +| Schema Registry | Avro/Protobuf schema management | Required for typed Kafka sources | +| Balancerd | pgwire connection routing | CRITICAL for multi-tenant | + +## Existing Test Strategy + +### mzcompose Framework (`misc/python/materialize/mzcompose/`) +- Meta-test framework generating Docker Compose files dynamically +- `Composition` class loads `mzcompose.py` files, discovers `workflow_*()` functions +- Pre-built service classes: `Materialized`, `Clusterd`, `Kafka`, `Redpanda`, `Postgres`, `CockroachOrPostgresMetadata`, `Minio`, `Toxiproxy`, etc. +- Granular lifecycle control: `c.up()`, `c.kill()`, `c.stop()`, `c.pause()`, `c.override()` +- Generates YAML on-demand, passes to `docker compose` via file descriptors +- Health-check driven startup with configurable intervals + +### Test Frameworks +1. 
**testdrive (.td)** — declarative SQL test language with timeout assertions and version-conditional tests +2. **sqllogictest (.slt)** — standard SQL logic test format for correctness +3. **Platform Checks** — "write once, run everywhere" tests across upgrade/restart/failure scenarios +4. **parallel-workload** — random concurrent SQL operations stress testing + +### Failure Testing Coverage +**Tested**: clusterd crashes/recovery, CockroachDB restarts, network faults (Toxiproxy), failpoint injection, statement timeouts, source/sink resilience, 0DT deployments + +**Not tested at scale**: coordinated multi-node cascading failures, deterministic replay of timing-sensitive bugs, property-based invariant testing under adversarial fault injection — this is where Antithesis adds value + +## Assumptions +- The mzcompose-based Docker Compose approach is the right integration path (vs. K8s) +- The existing Antithesis K8s-based experiment scripts represent an older approach to be superseded +- Materialize's self-managed/community edition (single-node) is the target, not the cloud multi-tenant version + +## Open Questions +- Which mzcompose test suite(s) provide the best starting workload? (platform-checks, parallel-workload, or custom) +- What is the preferred metadata store for Antithesis testing — CockroachDB or PostgreSQL? +- Should we test with multiple compute replicas or single replica? +- Are there specific failure scenarios the Materialize team wants prioritized? diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py new file mode 100644 index 0000000000000..61334a63cc461 --- /dev/null +++ b/test/antithesis/export-compose.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +"""Export the resolved docker-compose YAML for the Antithesis composition. + +Loads the mzcompose composition and dumps the compose dict to stdout as +YAML — without building any images or requiring a running Docker daemon. 
+ +mzbuild references are replaced with public images where possible, +or local tags for images that must be built (e.g. the workload). + +Usage: + bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml +""" + +import sys +from pathlib import Path + +import yaml + +from materialize.mzbuild import Repository +from materialize.mzcompose.composition import Composition + +# Map mzbuild names → image references for the Antithesis compose. +# Public images for infra; local build tag for the workload. +MZBUILD_TO_IMAGE = { + "materialized": "materialize/materialized:latest", + "postgres": "postgres:17.7", + "minio": "minio/minio:latest", + "antithesis-workload": "materialize-workload:latest", +} + +repo = Repository(Path("."), arch="x86_64") +c = Composition(repo, "antithesis", munge_services=False) + +for name, svc in c.compose["services"].items(): + svc["platform"] = "linux/amd64" + + if "mzbuild" in svc: + mzbuild_name = svc.pop("mzbuild") + if mzbuild_name not in MZBUILD_TO_IMAGE: + print( + f"warning: no image mapping for mzbuild {mzbuild_name!r}, " + f"using {mzbuild_name}:latest", + file=sys.stderr, + ) + svc["image"] = f"{mzbuild_name}:latest" + else: + svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name] + + # Vanilla postgres needs trust auth to match the mzbuild image behavior + # (materialized connects as root with no password) + if svc.get("image", "").startswith("postgres:"): + svc.setdefault("environment", []).append("POSTGRES_HOST_AUTH_METHOD=trust") + + # Drop mzcompose-only keys that docker/podman compose doesn't understand + for key in ["propagate_uid_gid", "allow_host_ports", "publish"]: + svc.pop(key, None) + +yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py new file mode 100644 index 0000000000000..d84b0f0108bd5 --- /dev/null +++ b/test/antithesis/mzcompose.py @@ -0,0 +1,88 @@ +# Copyright Materialize, Inc. and contributors. 
All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +""" +Antithesis test composition for Materialize. + +Defines the minimal topology needed to exercise Materialize under Antithesis: + - postgres-metadata: consensus/catalog store + - minio: S3-compatible blob storage for persist + - redpanda: Kafka-compatible broker for source ingestion + - materialized: the SUT (embedded clusterd mode) + - workload: Python test driver with Antithesis SDK + +Usage: + bin/mzcompose --find antithesis run default # bring up the cluster + bin/mzcompose --find antithesis run export-compose # dump compose YAML +""" + +import sys + +import yaml + +from materialize.mzcompose.composition import Composition, WorkflowArgumentParser +from materialize.mzcompose.service import Service, ServiceConfig +from materialize.mzcompose.services.materialized import Materialized +from materialize.mzcompose.services.minio import Minio +from materialize.mzcompose.services.postgres import PostgresMetadata +from materialize.mzcompose.services.redpanda import Redpanda + + +class Workload(Service): + """Antithesis workload client — Python test driver.""" + + def __init__(self) -> None: + config: ServiceConfig = { + "mzbuild": "antithesis-workload", + "depends_on": { + "materialized": {"condition": "service_healthy"}, + "redpanda": {"condition": "service_healthy"}, + }, + "environment": [ + "PGHOST=materialized", + "PGPORT=6875", + "PGUSER=materialize", + "KAFKA_BROKER=kafka:9092", + "SCHEMA_REGISTRY_URL=http://schema-registry:8081", + ], + } + super().__init__(name="workload", config=config) + + +SERVICES = [ + PostgresMetadata(), + Minio(setup_materialize=True), + Redpanda(auto_create_topics=True), + Materialized( + 
external_blob_store=True, + external_metadata_store=True, + metadata_store="postgres-metadata", + unsafe_mode=True, + soft_assertions=True, + sanity_restart=False, + ), + Workload(), +] + + +def workflow_default(c: Composition) -> None: + """Bring up the Antithesis test cluster.""" + c.up("postgres-metadata", "minio", "redpanda") + c.up("materialized") + c.up("workload") + + +def workflow_export_compose(c: Composition) -> None: + """Export the resolved docker-compose YAML to stdout. + + Usage: + bin/mzcompose --find antithesis run export-compose > antithesis/config/docker-compose.yaml + """ + # c.compose is the fully-resolved compose dict (mzbuild: replaced with image:) + yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile new file mode 100644 index 0000000000000..804cb1b3009ec --- /dev/null +++ b/test/antithesis/workload/Dockerfile @@ -0,0 +1,34 @@ +# Antithesis workload client for Materialize. +# +# Python-based test driver that connects to materialized via pgwire, +# produces Kafka messages, and emits Antithesis assertions. 
+ +FROM python:3.12-slim-bookworm + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + psycopg[binary]==3.2.9 \ + confluent-kafka==2.8.0 \ + antithesis==0.2.0 + +# setup-complete script +COPY setup-complete.sh /usr/local/bin/setup-complete.sh +RUN chmod +x /usr/local/bin/setup-complete.sh + +# Test template directory — populated by antithesis-workload skill later +RUN mkdir -p /opt/antithesis/test/v1/materialize + +# Catalog directory for Python assertion cataloging +RUN mkdir -p /opt/antithesis/catalog + +# Copy test templates and entrypoint +COPY test/ /opt/antithesis/test/v1/materialize/ +COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true + +ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"] diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml new file mode 100644 index 0000000000000..beed6bf84e93b --- /dev/null +++ b/test/antithesis/workload/mzbuild.yml @@ -0,0 +1 @@ +name: antithesis-workload diff --git a/test/antithesis/workload/setup-complete.sh b/test/antithesis/workload/setup-complete.sh new file mode 100755 index 0000000000000..59384ae9ba2b4 --- /dev/null +++ b/test/antithesis/workload/setup-complete.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Run this script to inform Antithesis that it can start running Test Composer +# Commands. You can also use the Antithesis SDK to emit setup-complete from your +# system if that is easier. +# +# Antithesis sets the `ANTITHESIS_OUTPUT_DIR` environment variable +# automatically. This script is setup to emit `setup_complete` to the +# `sdk.jsonl` file in that directory. 
+OUTPUT_PATH="/tmp/antithesis_sdk.jsonl" +if [[ -n "${ANTITHESIS_OUTPUT_DIR:-}" ]]; then + OUTPUT_PATH="${ANTITHESIS_OUTPUT_DIR}/sdk.jsonl" + echo "Running in Antithesis, emitting setup_complete to ${OUTPUT_PATH}" +elif [[ -n "${ANTITHESIS_SDK_LOCAL_OUTPUT:-}" ]]; then + OUTPUT_PATH="${ANTITHESIS_SDK_LOCAL_OUTPUT}" + echo "Antithesis SDK local output override detected, emitting setup_complete to ${OUTPUT_PATH}" +fi + +mkdir -p "$(dirname "$OUTPUT_PATH")" +echo '{"antithesis_setup":{"status":"complete","details":{"message":"ready to go"}}}' >> "${OUTPUT_PATH}" diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh new file mode 100755 index 0000000000000..f3feefe5a402e --- /dev/null +++ b/test/antithesis/workload/test/anytime_health_check.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -uo pipefail # no -e: a failing psql must fall through to the "failed" branch below + +# Basic health check — verifies materialized is responding to SQL. +# This is a minimal placeholder; the antithesis-workload skill will add +# real test commands with property assertions. + +PGHOST="${PGHOST:-materialized}" +PGPORT="${PGPORT:-6875}" +PGUSER="${PGUSER:-materialize}" + +result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>&1) +if [ "$result" = "1" ]; then + echo "Health check passed" + exit 0 +else + echo "Health check failed: $result" + exit 1 +fi diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh new file mode 100755 index 0000000000000..0f5b012c3ad9e --- /dev/null +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Wait for materialized to be ready before signaling setup_complete. +echo "Waiting for materialized to become healthy..." +until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do + sleep 1 +done +echo "materialized is healthy." + +# Emit setup_complete — Antithesis begins test commands after this.
+/usr/local/bin/setup-complete.sh + +# Sleep forever — Test Composer runs the test commands, not this entrypoint. +echo "Setup complete. Sleeping while Test Composer runs commands." +exec sleep infinity From 127f67ec4b598c676c49b2ecd321b35d9f3e42c5 Mon Sep 17 00:00:00 2001 From: Mitch Wagner Date: Wed, 6 May 2026 17:47:03 -0400 Subject: [PATCH 02/65] feat: tweaks for basic_test --- antithesis/Makefile | 16 ++++-- antithesis/config/Dockerfile | 2 + antithesis/config/docker-compose.yaml | 18 ++++--- test/antithesis/export-compose.py | 50 +++++++++++++++++-- .../workload/test/anytime_health_check.sh | 2 +- 5 files changed, 73 insertions(+), 15 deletions(-) create mode 100644 antithesis/config/Dockerfile diff --git a/antithesis/Makefile b/antithesis/Makefile index d29e795d22be7..0afa1cd1f3335 100644 --- a/antithesis/Makefile +++ b/antithesis/Makefile @@ -39,17 +39,23 @@ export-compose: # --------------------------------------------------------------------------- # Build — build images that don't have public equivalents. 
# --------------------------------------------------------------------------- -LOCAL_IMAGES := workload +LOCAL_IMAGES := workload config BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%) .PHONY: build $(BUILD_TARGETS) -build: $(BUILD_TARGETS) +build: export-compose $(BUILD_TARGETS) -$(BUILD_TARGETS): build-%: +build-workload: $(RUNTIME) build \ --platform linux/amd64 \ - -t $(PROJECT)-$*:latest \ - $(REPO_ROOT)/test/antithesis/$* + -t $(PROJECT)-workload:latest \ + $(REPO_ROOT)/test/antithesis/workload + +build-config: export-compose + $(RUNTIME) build \ + --platform linux/amd64 \ + -t $(PROJECT)-config:latest \ + config # --------------------------------------------------------------------------- # Up / Down diff --git a/antithesis/config/Dockerfile b/antithesis/config/Dockerfile new file mode 100644 index 0000000000000..fb59d4a2bd588 --- /dev/null +++ b/antithesis/config/Dockerfile @@ -0,0 +1,2 @@ +FROM scratch +COPY docker-compose.yaml / diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml index 6eb68d6f7e789..b85c1e4d72299 100644 --- a/antithesis/config/docker-compose.yaml +++ b/antithesis/config/docker-compose.yaml @@ -15,7 +15,6 @@ services: environment: - POSTGRESDB=postgres - POSTGRES_PASSWORD=postgres - - LD_PRELOAD=libeatmydata.so - PGPORT=26257 - POSTGRES_HOST_AUTH_METHOD=trust healthcheck: @@ -27,10 +26,20 @@ services: interval: 1s start_period: 30s restart: 'no' - volumes: - - ../../misc/postgres/setup_materialize.sql:/docker-entrypoint-initdb.d/z_setup_materialize.sql platform: linux/amd64 image: postgres:17.7 + entrypoint: + - sh + - -c + - 'echo "CREATE ROLE root WITH LOGIN PASSWORD ''root'';CREATE DATABASE root;GRANT + ALL PRIVILEGES ON DATABASE root TO root;\c root;CREATE SCHEMA IF NOT EXISTS + consensus AUTHORIZATION root;CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION + root;CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;CREATE SCHEMA IF + NOT EXISTS tsoracle AUTHORIZATION root;GRANT ALL PRIVILEGES ON 
SCHEMA public + TO root;" > /docker-entrypoint-initdb.d/z_setup_materialize.sql + + exec docker-entrypoint.sh "$$@"' + - -- minio: entrypoint: - sh @@ -129,7 +138,6 @@ services: - MZ_INTERNAL_PERSIST_PUBSUB_LISTEN_ADDR=0.0.0.0:6879 - MZ_PERSIST_PUBSUB_URL=http://127.0.0.1:6879 - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection - - MZ_EXTERNAL_LOGIN_PASSWORD_MZ_SYSTEM=password - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345 - MZ_CATALOG_STORE=persist - MZ_LOG_FILTER @@ -269,9 +277,7 @@ services: - MZ_NO_BUILTIN_POSTGRES=1 - MZ_NO_BUILTIN_COCKROACH=1 - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter - - MZ_LISTENERS_CONFIG_PATH=/listeners_config volumes: - - /home/mitch/src/customer/customer-materialize/materialize/src/materialized/ci/listener_configs/testdrive.json:/listeners_config - mzdata:/mzdata - mydata:/var/lib/mysql-files - tmp:/share/tmp diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 61334a63cc461..5b487a5485bc2 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -46,10 +46,54 @@ else: svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name] - # Vanilla postgres needs trust auth to match the mzbuild image behavior - # (materialized connects as root with no password) + # Fixups for vanilla postgres (the mzbuild image has eatmydata, custom + # pg_hba.conf, and baked-in init SQL — none of which exist in the public image). if svc.get("image", "").startswith("postgres:"): - svc.setdefault("environment", []).append("POSTGRES_HOST_AUTH_METHOD=trust") + env = svc.setdefault("environment", []) + # Remove eatmydata — not installed in vanilla postgres + env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")] + # Trust auth so materialized can connect as root without a password + env.append("POSTGRES_HOST_AUTH_METHOD=trust") + # Remove host bind-mount for setup SQL — won't exist in Antithesis.
+ # Instead, inline the init SQL that creates the schemas materialized needs. + vols = svc.get("volumes", []) + vols[:] = [v for v in vols if "setup_materialize.sql" not in v] + if not vols: + svc.pop("volumes", None) + # Inline the init SQL via an entrypoint wrapper (no volume or bind-mount needed) + init_sql = ( + "CREATE ROLE root WITH LOGIN PASSWORD 'root';" + "CREATE DATABASE root;" + "GRANT ALL PRIVILEGES ON DATABASE root TO root;" + r"\c root;" + "CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root;" + "CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root;" + "CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;" + "CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root;" + "GRANT ALL PRIVILEGES ON SCHEMA public TO root;" + ) + # Wrap the stock entrypoint: write the init SQL, then exec docker-entrypoint.sh with the forwarded args + svc["entrypoint"] = ["sh", "-c", f""" +echo "{init_sql}" > /docker-entrypoint-initdb.d/z_setup_materialize.sql +exec docker-entrypoint.sh "$$@" +""".strip(), "--"] + + # Strip host bind-mounts — they won't resolve in Antithesis + if "volumes" in svc: + svc["volumes"] = [ + v for v in svc["volumes"] + if not isinstance(v, str) or ":" not in v or not v.split(":")[0].startswith("/") + ] + if not svc["volumes"]: + del svc["volumes"] + + # Remove env vars that point at host-only paths (the Docker image + # entrypoint provides sensible defaults when these are unset) + if "environment" in svc: + svc["environment"] = [ + e for e in svc["environment"] + if not e.startswith(("MZ_LISTENERS_CONFIG_PATH=", "MZ_EXTERNAL_LOGIN_PASSWORD_")) + ] # Drop mzcompose-only keys that docker/podman compose doesn't understand for key in ["propagate_uid_gid", "allow_host_ports", "publish"]: diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh index f3feefe5a402e..f7d743ebc4cd7 100755 --- a/test/antithesis/workload/test/anytime_health_check.sh +++ b/test/antithesis/workload/test/anytime_health_check.sh @@ -9,7 +9,7 @@ PGHOST="${PGHOST:-materialized}" PGPORT="${PGPORT:-6875}"
PGUSER="${PGUSER:-materialize}" -result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>&1) +result=$(psql -h "$PGHOST" -p "$PGPORT" -U "$PGUSER" -tAc "SELECT 1" 2>/dev/null) if [ "$result" = "1" ]; then echo "Health check passed" exit 0 From 8323b8a1e260109f285e81ff583137f43436d793 Mon Sep 17 00:00:00 2001 From: Mitch Wagner Date: Thu, 7 May 2026 15:04:42 -0400 Subject: [PATCH 03/65] feat: working instrumentation --- antithesis/Makefile | 22 ++++++++++-- antithesis/config/docker-compose.yaml | 2 +- bin/ci-builder | 47 +++++++++++++++++++------- ci/builder/Dockerfile | 5 +++ misc/python/materialize/mzbuild.py | 46 +++++++++++++++++-------- misc/python/materialize/rustc_flags.py | 14 ++++++++ src/materialized/ci/Dockerfile | 11 ++++++ test/antithesis/export-compose.py | 2 +- 8 files changed, 117 insertions(+), 32 deletions(-) diff --git a/antithesis/Makefile b/antithesis/Makefile index 0afa1cd1f3335..25bf6408cf927 100644 --- a/antithesis/Makefile +++ b/antithesis/Makefile @@ -21,6 +21,9 @@ endif ifeq ($(RUNTIME),none) $(error neither podman nor docker found in PATH; set RUNTIME=docker or install podman) endif +ifeq ($(RUNTIME),podman) + export MZ_DEV_CI_BUILDER_RUNTIME := podman +endif COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f config/docker-compose.yaml PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize @@ -39,12 +42,26 @@ export-compose: # --------------------------------------------------------------------------- # Build — build images that don't have public equivalents. 
# --------------------------------------------------------------------------- -LOCAL_IMAGES := workload config +LOCAL_IMAGES := materialized workload config BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%) -.PHONY: build $(BUILD_TARGETS) +.PHONY: build $(BUILD_TARGETS) build-builder build: export-compose $(BUILD_TARGETS) +build-builder: + cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder build stable --load + @tag=$$(cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder tag stable); \ + echo "Tagging materialize/ci-builder:$$tag -> $(PROJECT)-builder:latest"; \ + $(RUNTIME) tag "materialize/ci-builder:$$tag" $(PROJECT)-builder:latest + +build-materialized: build-builder + cd $(REPO_ROOT) && $(REPO_ROOT)/bin/mzimage acquire materialized --antithesis + @# Tag the mzbuild output to the name the compose file expects + @img=$$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \ + | grep 'materialized:mzbuild-' | head -1); \ + echo "Tagging $$img -> $(PROJECT)-materialized:latest"; \ + $(RUNTIME) tag "$$img" $(PROJECT)-materialized:latest + build-workload: $(RUNTIME) build \ --platform linux/amd64 \ @@ -85,6 +102,7 @@ test: push: @$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \ | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \ + | grep -v '$(PROJECT)-builder' \ | while read item; do \ nametag="$${item#localhost/}"; \ name="$${nametag%:*}"; \ diff --git a/antithesis/config/docker-compose.yaml b/antithesis/config/docker-compose.yaml index b85c1e4d72299..004fc60b245d9 100644 --- a/antithesis/config/docker-compose.yaml +++ b/antithesis/config/docker-compose.yaml @@ -294,7 +294,7 @@ services: start_period: 600s stop_grace_period: 120s platform: linux/amd64 - image: materialize/materialized:latest + image: materialize-materialized:latest workload: depends_on: materialized: diff --git a/bin/ci-builder b/bin/ci-builder index 066bf273130a9..0e81c806063d8 100755 --- a/bin/ci-builder +++ b/bin/ci-builder @@ -18,6 +18,9 @@ set -euo pipefail NIGHTLY_RUST_DATE=2026-05-06 +# Allow 
overriding the container runtime (e.g. MZ_DEV_CI_BUILDER_RUNTIME=podman). +DOCKER="${MZ_DEV_CI_BUILDER_RUNTIME:-docker}" + workdir=$(pwd) cd "$(dirname "$0")/.." @@ -128,10 +131,14 @@ gid=$(id -g) [[ "$gid" -lt 500 ]] && gid=$uid build() { + local cache_args=() + if [[ "$DOCKER" != "podman" ]]; then + cache_args+=(--cache-from=materialize/ci-builder:"$cache_tag") + cache_args+=(--cache-to=type=inline,mode=max) + fi # shellcheck disable=SC2086 # intentional splitting of build args string - docker buildx build --pull \ - --cache-from=materialize/ci-builder:"$cache_tag" \ - --cache-to=type=inline,mode=max \ + "$DOCKER" buildx build --pull \ + "${cache_args[@]}" \ $docker_build_args \ --tag materialize/ci-builder:"$tag" \ --tag ghcr.io/materializeinc/materialize/ci-builder:"$tag" \ @@ -181,13 +188,13 @@ case "$cmd" in build "$@" ;; exists) - docker manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null + "$DOCKER" manifest inspect "$image_registry"/ci-builder:"$tag" &> /dev/null ;; tag) echo "$tag" ;; push) - docker login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN" + "$DOCKER" login ghcr.io -u materialize-bot --password "$GITHUB_GHCR_TOKEN" build --push "$@" ;; run) @@ -372,20 +379,26 @@ case "$cmd" in ) fi if [[ "$(uname -s)" = Linux ]]; then - args+=( - --user "$(id -u):$(stat -c %g /var/run/docker.sock)" - ) + if [[ "${MZ_DEV_CI_BUILDER_RUNTIME:-docker}" == "podman" ]]; then + args+=(--userns=keep-id) + else + args+=( + --user "$(id -u):$(stat -c %g /var/run/docker.sock)" + ) + fi if [[ $secrets == "true" ]]; then # Allow Docker-in-Docker by mounting the Docker socket in the # container. Host networking allows us to see ports created by # containers that we launch. 
args+=( - --volume "/var/run/docker.sock:/var/run/docker.sock" --network host --env "DOCKER_TLS_VERIFY=${DOCKER_TLS_VERIFY-}" --env "DOCKER_HOST=${DOCKER_HOST-}" ) + if [[ -S /var/run/docker.sock ]]; then + args+=(--volume "/var/run/docker.sock:/var/run/docker.sock") + fi # Forward Docker configuration too, if available. docker_dir=${DOCKER_CONFIG:-$HOME/.docker} @@ -431,14 +444,22 @@ case "$cmd" in image="$image_registry/ci-builder:$tag" # Try downloading the image a few times in case of registry flakiness if [[ "${CI:-}" ]]; then - if ! docker inspect "$image" > /dev/null 2>&1; then - docker pull "$image" || (sleep 3 && docker pull "$image") || (sleep 3 && docker pull "$image") || sleep 3 + if ! "$DOCKER" inspect "$image" > /dev/null 2>&1; then + "$DOCKER" pull "$image" || (sleep 3 && "$DOCKER" pull "$image") || (sleep 3 && "$DOCKER" pull "$image") || sleep 3 fi fi - docker run "${args[@]}" "$image" eatmydata "${docker_command[@]}" + if [[ "$DOCKER" == "podman" ]]; then + # --userns=keep-id already maps the host UID/GID into the + # container, so autouseradd is unnecessary. Override the + # entrypoint to skip it. 
+ args+=(--entrypoint eatmydata) + "$DOCKER" run "${args[@]}" "$image" "${docker_command[@]}" + else + "$DOCKER" run "${args[@]}" "$image" eatmydata "${docker_command[@]}" + fi ;; root-shell) - docker exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh + "$DOCKER" exec --interactive --tty --user 0:0 "$(<"$cid_file")" eatmydata ci/builder/root-shell.sh ;; *) printf "unknown command %q\n" "$cmd" diff --git a/ci/builder/Dockerfile b/ci/builder/Dockerfile index be1da20d8591f..eb6b71be277a4 100644 --- a/ci/builder/Dockerfile +++ b/ci/builder/Dockerfile @@ -399,6 +399,11 @@ ENV CARGO_HOME=/cargo RUN mkdir /cargo && chmod 777 /cargo VOLUME /cargo +# Antithesis coverage instrumentation library (used when --antithesis is passed) +RUN curl -sSL https://antithesis.com/assets/instrumentation/libvoidstar.so \ + -o /usr/lib/libvoidstar.so \ + && ldconfig + # Stage 3: Build a lightweight CI Builder image for console/playwright jobs. FROM ubuntu:noble-20260324 AS ci-builder-console diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index f653b84abc4a9..2200188139219 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -187,6 +187,7 @@ def __init__( sanitizer: Sanitizer, image_registry: str, image_prefix: str, + antithesis: bool = False, ): self.root = root self.arch = arch @@ -196,6 +197,7 @@ def __init__( self.cargo_workspace = cargo.Workspace(root) self.image_registry = image_registry self.image_prefix = image_prefix + self.antithesis = antithesis def build( self, @@ -513,6 +515,8 @@ def extra(self) -> str: flags += "optimized" if self.rd.coverage: flags += "coverage" + if self.rd.antithesis: + flags += ["antithesis"] if self.rd.sanitizer != Sanitizer.none: flags += self.rd.sanitizer.value flags.sort() @@ -547,15 +551,14 @@ def generate_cargo_build_command( examples: list[str], features: list[str] | None = None, ) -> list[str]: - rustflags = ( - rustc_flags.coverage - if 
rd.coverage - else ( - rustc_flags.sanitizer[rd.sanitizer] - if rd.sanitizer != Sanitizer.none - else ["--cfg=tokio_unstable"] - ) - ) + if rd.antithesis: + rustflags = rustc_flags.antithesis + elif rd.coverage: + rustflags = rustc_flags.coverage + elif rd.sanitizer != Sanitizer.none: + rustflags = rustc_flags.sanitizer[rd.sanitizer] + else: + rustflags = ["--cfg=tokio_unstable"] cflags = ( [ f"--target={target(rd.arch)}", @@ -568,8 +571,8 @@ def generate_cargo_build_command( if rd.sanitizer != Sanitizer.none else [] ) - extra_env = ( - { + if rd.sanitizer != Sanitizer.none: + extra_env = { "CFLAGS": " ".join(cflags), "CXXFLAGS": " ".join(cflags), "LDFLAGS": " ".join(cflags), @@ -582,9 +585,8 @@ def generate_cargo_build_command( "PATH": f"/sanshim:/opt/x-tools/{target(rd.arch)}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "TSAN_OPTIONS": "report_bugs=0", # build-scripts fail } - if rd.sanitizer != Sanitizer.none - else {} - ) + else: + extra_env = {} cargo_build = rd.build( "build", channel=None, rustflags=rustflags, extra_env=extra_env @@ -672,7 +674,11 @@ def copy(src: Path, relative_dst: Path) -> None: exe_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(src, exe_path) - if self.strip: + if self.rd.antithesis: + # Antithesis needs full debug symbols for symbolization. + # Don't strip anything. + pass + elif self.strip: # The debug information is large enough that it slows down CI, # since we're packaging these binaries up into Docker images and # shipping them around. 
@@ -945,6 +951,7 @@ def _build_locked( "ARCH_GCC": str(self.image.rd.arch), "ARCH_GO": self.image.rd.arch.go_str(), "CI_SANITIZER": str(self.image.rd.sanitizer), + "ANTITHESIS": "1" if self.image.rd.antithesis else "", } f = self.write_dockerfile() @@ -1416,6 +1423,7 @@ def __init__( sanitizer: Sanitizer = Sanitizer.none, image_registry: str = image_registry(), image_prefix: str = "", + antithesis: bool = False, ): self.rd = RepositoryDetails( root, @@ -1425,6 +1433,7 @@ def __init__( sanitizer, image_registry, image_prefix, + antithesis=antithesis, ) self.images: dict[str, Image] = {} self.compositions: dict[str, Path] = {} @@ -1517,6 +1526,12 @@ def install_arguments(parser: argparse.ArgumentParser) -> None: default="", help="a prefix to apply to all Docker image names", ) + parser.add_argument( + "--antithesis", + help="whether to enable Antithesis coverage instrumentation", + default=ui.env_is_truthy("CI_ANTITHESIS"), + action="store_true", + ) @classmethod def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository": @@ -1544,6 +1559,7 @@ def from_arguments(cls, root: Path, args: argparse.Namespace) -> "Repository": image_registry=args.image_registry, image_prefix=args.image_prefix, arch=args.arch, + antithesis=args.antithesis, ) @property diff --git a/misc/python/materialize/rustc_flags.py b/misc/python/materialize/rustc_flags.py index 6353f83d3b68a..f6aac45573e14 100644 --- a/misc/python/materialize/rustc_flags.py +++ b/misc/python/materialize/rustc_flags.py @@ -25,6 +25,20 @@ ] +# Flags to enable Antithesis coverage instrumentation. +# Requires libvoidstar.so at /usr/lib/ (installed in ci-builder and +# the materialized Docker image). 
+# See: https://antithesis.com/docs/using_antithesis/sdk/rust/instrumentation/ +antithesis = [ + "-Ccodegen-units=1", + "-Cpasses=sancov-module", + "-Cllvm-args=-sanitizer-coverage-level=3", + "-Cllvm-args=-sanitizer-coverage-trace-pc-guard", + "-Clink-args=-Wl,--build-id", + "-lvoidstar", +] + + class Sanitizer(Enum): """What sanitizer to use""" diff --git a/src/materialized/ci/Dockerfile b/src/materialized/ci/Dockerfile index 18686251a7b07..e06aaf6bad0cf 100644 --- a/src/materialized/ci/Dockerfile +++ b/src/materialized/ci/Dockerfile @@ -20,6 +20,17 @@ COPY materialized entrypoint.sh /usr/local/bin/ USER root RUN ln -s /usr/local/bin/materialized /usr/local/bin/environmentd \ && ln -s /usr/local/bin/materialized /usr/local/bin/clusterd + +# Antithesis instrumentation (conditional on --build-arg ANTITHESIS=1) +ARG ANTITHESIS +RUN if [ -n "$ANTITHESIS" ]; then \ + curl -sSL https://antithesis.com/assets/instrumentation/libvoidstar.so \ + -o /usr/lib/libvoidstar.so \ + && ldconfig \ + && mkdir -p /symbols \ + && ln -s /usr/local/bin/materialized /symbols/materialized; \ + fi + USER materialize ENTRYPOINT ["tini", "--", "entrypoint.sh"] diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 5b487a5485bc2..6d7e463d564a2 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -22,7 +22,7 @@ # Map mzbuild names → image references for the Antithesis compose. # Public images for infra; local build tag for the workload. 
MZBUILD_TO_IMAGE = { - "materialized": "materialize/materialized:latest", + "materialized": "materialize-materialized:latest", "postgres": "postgres:17.7", "minio": "minio/minio:latest", "antithesis-workload": "materialize-workload:latest", From 2d9ae67499ff0f1e8842467803e71dcd9d513812 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 13:48:21 -0400 Subject: [PATCH 04/65] test/antithesis: consolidate antithesis/ into test/antithesis/ --- antithesis/AGENTS.md | 15 ------------- test/antithesis/AGENTS.md | 21 +++++++++++++++++++ {antithesis => test/antithesis}/Makefile | 6 +++--- .../antithesis}/config/Dockerfile | 0 .../antithesis}/config/docker-compose.yaml | 0 .../antithesis}/scratchbook/bug-candidates.md | 0 .../scratchbook/deployment-topology.md | 0 .../scratchbook/existing-assertions.md | 0 .../catalog-recovery-consistency.md | 0 .../properties/command-channel-ordering.md | 0 .../compute-replica-epoch-isolation.md | 0 .../critical-reader-fence-linearization.md | 0 .../properties/deployment-lag-detection.md | 0 .../properties/deployment-promotion-safety.md | 0 .../epoch-fencing-prevents-split-brain.md | 0 .../properties/fault-recovery-exercised.md | 0 .../properties/group-commit-toctou-safety.md | 0 .../idempotent-write-under-indeterminate.md | 0 .../properties/mv-reflects-source-updates.md | 0 .../properties/peek-lifecycle-exactly-once.md | 0 .../properties/persist-cas-monotonicity.md | 0 .../properties/source-ingestion-progress.md | 0 .../storage-command-replay-idempotent.md | 0 .../properties/strict-serializable-reads.md | 0 .../properties/tombstone-sealing-finality.md | 0 .../scratchbook/property-catalog.md | 0 .../scratchbook/property-relationships.md | 0 .../antithesis}/scratchbook/sut-analysis.md | 0 28 files changed, 24 insertions(+), 18 deletions(-) delete mode 100644 antithesis/AGENTS.md create mode 100644 test/antithesis/AGENTS.md rename {antithesis => test/antithesis}/Makefile (96%) rename {antithesis => 
test/antithesis}/config/Dockerfile (100%) rename {antithesis => test/antithesis}/config/docker-compose.yaml (100%) rename {antithesis => test/antithesis}/scratchbook/bug-candidates.md (100%) rename {antithesis => test/antithesis}/scratchbook/deployment-topology.md (100%) rename {antithesis => test/antithesis}/scratchbook/existing-assertions.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/catalog-recovery-consistency.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/command-channel-ordering.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/compute-replica-epoch-isolation.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/critical-reader-fence-linearization.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/deployment-lag-detection.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/deployment-promotion-safety.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/epoch-fencing-prevents-split-brain.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/fault-recovery-exercised.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/group-commit-toctou-safety.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/idempotent-write-under-indeterminate.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/mv-reflects-source-updates.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/peek-lifecycle-exactly-once.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/persist-cas-monotonicity.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/source-ingestion-progress.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/storage-command-replay-idempotent.md (100%) rename {antithesis => test/antithesis}/scratchbook/properties/strict-serializable-reads.md (100%) rename {antithesis => 
test/antithesis}/scratchbook/properties/tombstone-sealing-finality.md (100%) rename {antithesis => test/antithesis}/scratchbook/property-catalog.md (100%) rename {antithesis => test/antithesis}/scratchbook/property-relationships.md (100%) rename {antithesis => test/antithesis}/scratchbook/sut-analysis.md (100%) diff --git a/antithesis/AGENTS.md b/antithesis/AGENTS.md deleted file mode 100644 index ff80e8994fb67..0000000000000 --- a/antithesis/AGENTS.md +++ /dev/null @@ -1,15 +0,0 @@ -This directory contains files relevant to running tests in Antithesis. - -Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands. - -**setup-complete.sh** -Inject this script into a Dockerfile to notify Antithesis that setup is complete. This script should only run once the system under test is ready for testing. Antithesis will not run any test commands until it receives this event. - -**config** -This directory contains the `docker-compose.yaml` file used to bring up this system within the Antithesis environment, along with any closely related config files. - -**scratchbook** -This directory is the Antithesis scratchbook for the codebase. It contains documents such as system analysis, property catalogs, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, and other persistent integration notes. Keep it up to date as Antithesis-related decisions change. - -**test** -This directory contains test templates. A test template is a directory containing test command executable files. Each test command must have a valid prefix: `parallel_driver_, singleton_driver_, serial_driver_, first_, eventually_, finally_, anytime_`. Prefixes constrain when and how commands are composed in a single timeline. 
Files or subdirectories prefixed with `helper_` are ignored by Test Composer and can be used for helper scripts kept alongside the commands. diff --git a/test/antithesis/AGENTS.md b/test/antithesis/AGENTS.md new file mode 100644 index 0000000000000..b93956df1ea94 --- /dev/null +++ b/test/antithesis/AGENTS.md @@ -0,0 +1,21 @@ +Files relevant to running Materialize under Antithesis. + +Use the `antithesis-setup` skill to scaffold and manage this directory. Use the `antithesis-research` skill to analyze the system and build a property catalog. Use the `antithesis-workload` skill to implement assertions and test commands. + +**mzcompose.py** +Source of truth for the Antithesis topology. Standard mzcompose composition: services (`postgres-metadata`, `minio`, `redpanda`, `materialized`, `workload`), dependencies, env, ports. The generated `config/docker-compose.yaml` is derived from this. + +**export-compose.py** +Renders `mzcompose.py` into a flat docker-compose YAML that Antithesis can consume. Images are emitted as `ghcr.io/materializeinc/materialize/<image>:mzbuild-<fingerprint>` refs that Antithesis pulls directly from public GHCR. + +**workload/** +Mzbuild image (`antithesis-workload`) for the Python test driver. Dockerfile, entrypoint, and test-template scripts (`test/*.sh`) live here. Test command files must be prefixed with one of `parallel_driver_`, `singleton_driver_`, `serial_driver_`, `first_`, `eventually_`, `finally_`, `anytime_`; files prefixed with `helper_` are ignored by Test Composer. + +**config/** +Mzbuild image (`antithesis-config`) — a `FROM scratch` container holding the generated `docker-compose.yaml`. This is the image Antithesis points at to bring up the environment. + +**scratchbook/** +Antithesis scratchbook: system analysis, property catalog, topology plans, per-property evidence files (in `scratchbook/properties/`), property relationship maps, persistent integration notes. Keep up to date as Antithesis-related decisions change.
+ +**setup-complete.sh** (in `workload/`) +Inject this script into a Dockerfile to notify Antithesis that setup is complete. Should only run once the system under test is ready for testing — Antithesis will not run test commands until it receives this event. diff --git a/antithesis/Makefile b/test/antithesis/Makefile similarity index 96% rename from antithesis/Makefile rename to test/antithesis/Makefile index 25bf6408cf927..bee47a5d6f2c6 100644 --- a/antithesis/Makefile +++ b/test/antithesis/Makefile @@ -13,7 +13,7 @@ SHELL := /usr/bin/env bash .SHELLFLAGS := -eu -o pipefail -c PROJECT := materialize -REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/..) +REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/../..) ifndef RUNTIME RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none)) @@ -36,8 +36,8 @@ REGISTRY_PATH ?= /molten-verve-216720/materialize-repository # --------------------------------------------------------------------------- .PHONY: export-compose export-compose: - cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml - @echo "Wrote config/docker-compose.yaml" + cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml + @echo "Wrote test/antithesis/config/docker-compose.yaml" # --------------------------------------------------------------------------- # Build — build images that don't have public equivalents. 
diff --git a/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile similarity index 100% rename from antithesis/config/Dockerfile rename to test/antithesis/config/Dockerfile diff --git a/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml similarity index 100% rename from antithesis/config/docker-compose.yaml rename to test/antithesis/config/docker-compose.yaml diff --git a/antithesis/scratchbook/bug-candidates.md b/test/antithesis/scratchbook/bug-candidates.md similarity index 100% rename from antithesis/scratchbook/bug-candidates.md rename to test/antithesis/scratchbook/bug-candidates.md diff --git a/antithesis/scratchbook/deployment-topology.md b/test/antithesis/scratchbook/deployment-topology.md similarity index 100% rename from antithesis/scratchbook/deployment-topology.md rename to test/antithesis/scratchbook/deployment-topology.md diff --git a/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md similarity index 100% rename from antithesis/scratchbook/existing-assertions.md rename to test/antithesis/scratchbook/existing-assertions.md diff --git a/antithesis/scratchbook/properties/catalog-recovery-consistency.md b/test/antithesis/scratchbook/properties/catalog-recovery-consistency.md similarity index 100% rename from antithesis/scratchbook/properties/catalog-recovery-consistency.md rename to test/antithesis/scratchbook/properties/catalog-recovery-consistency.md diff --git a/antithesis/scratchbook/properties/command-channel-ordering.md b/test/antithesis/scratchbook/properties/command-channel-ordering.md similarity index 100% rename from antithesis/scratchbook/properties/command-channel-ordering.md rename to test/antithesis/scratchbook/properties/command-channel-ordering.md diff --git a/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md b/test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md similarity index 100% rename from 
antithesis/scratchbook/properties/compute-replica-epoch-isolation.md rename to test/antithesis/scratchbook/properties/compute-replica-epoch-isolation.md diff --git a/antithesis/scratchbook/properties/critical-reader-fence-linearization.md b/test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md similarity index 100% rename from antithesis/scratchbook/properties/critical-reader-fence-linearization.md rename to test/antithesis/scratchbook/properties/critical-reader-fence-linearization.md diff --git a/antithesis/scratchbook/properties/deployment-lag-detection.md b/test/antithesis/scratchbook/properties/deployment-lag-detection.md similarity index 100% rename from antithesis/scratchbook/properties/deployment-lag-detection.md rename to test/antithesis/scratchbook/properties/deployment-lag-detection.md diff --git a/antithesis/scratchbook/properties/deployment-promotion-safety.md b/test/antithesis/scratchbook/properties/deployment-promotion-safety.md similarity index 100% rename from antithesis/scratchbook/properties/deployment-promotion-safety.md rename to test/antithesis/scratchbook/properties/deployment-promotion-safety.md diff --git a/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md b/test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md similarity index 100% rename from antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md rename to test/antithesis/scratchbook/properties/epoch-fencing-prevents-split-brain.md diff --git a/antithesis/scratchbook/properties/fault-recovery-exercised.md b/test/antithesis/scratchbook/properties/fault-recovery-exercised.md similarity index 100% rename from antithesis/scratchbook/properties/fault-recovery-exercised.md rename to test/antithesis/scratchbook/properties/fault-recovery-exercised.md diff --git a/antithesis/scratchbook/properties/group-commit-toctou-safety.md b/test/antithesis/scratchbook/properties/group-commit-toctou-safety.md similarity index 
100% rename from antithesis/scratchbook/properties/group-commit-toctou-safety.md rename to test/antithesis/scratchbook/properties/group-commit-toctou-safety.md diff --git a/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md b/test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md similarity index 100% rename from antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md rename to test/antithesis/scratchbook/properties/idempotent-write-under-indeterminate.md diff --git a/antithesis/scratchbook/properties/mv-reflects-source-updates.md b/test/antithesis/scratchbook/properties/mv-reflects-source-updates.md similarity index 100% rename from antithesis/scratchbook/properties/mv-reflects-source-updates.md rename to test/antithesis/scratchbook/properties/mv-reflects-source-updates.md diff --git a/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md b/test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md similarity index 100% rename from antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md rename to test/antithesis/scratchbook/properties/peek-lifecycle-exactly-once.md diff --git a/antithesis/scratchbook/properties/persist-cas-monotonicity.md b/test/antithesis/scratchbook/properties/persist-cas-monotonicity.md similarity index 100% rename from antithesis/scratchbook/properties/persist-cas-monotonicity.md rename to test/antithesis/scratchbook/properties/persist-cas-monotonicity.md diff --git a/antithesis/scratchbook/properties/source-ingestion-progress.md b/test/antithesis/scratchbook/properties/source-ingestion-progress.md similarity index 100% rename from antithesis/scratchbook/properties/source-ingestion-progress.md rename to test/antithesis/scratchbook/properties/source-ingestion-progress.md diff --git a/antithesis/scratchbook/properties/storage-command-replay-idempotent.md b/test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md similarity index 
100% rename from antithesis/scratchbook/properties/storage-command-replay-idempotent.md rename to test/antithesis/scratchbook/properties/storage-command-replay-idempotent.md diff --git a/antithesis/scratchbook/properties/strict-serializable-reads.md b/test/antithesis/scratchbook/properties/strict-serializable-reads.md similarity index 100% rename from antithesis/scratchbook/properties/strict-serializable-reads.md rename to test/antithesis/scratchbook/properties/strict-serializable-reads.md diff --git a/antithesis/scratchbook/properties/tombstone-sealing-finality.md b/test/antithesis/scratchbook/properties/tombstone-sealing-finality.md similarity index 100% rename from antithesis/scratchbook/properties/tombstone-sealing-finality.md rename to test/antithesis/scratchbook/properties/tombstone-sealing-finality.md diff --git a/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md similarity index 100% rename from antithesis/scratchbook/property-catalog.md rename to test/antithesis/scratchbook/property-catalog.md diff --git a/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md similarity index 100% rename from antithesis/scratchbook/property-relationships.md rename to test/antithesis/scratchbook/property-relationships.md diff --git a/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md similarity index 100% rename from antithesis/scratchbook/sut-analysis.md rename to test/antithesis/scratchbook/sut-analysis.md From 40100e8ee92c80a586531075c8cdca73437e9c9a Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 13:49:53 -0400 Subject: [PATCH 05/65] test/antithesis: add antithesis-config mzbuild image (FROM scratch + compose YAML) --- test/antithesis/config/Dockerfile | 13 +++++++++++++ test/antithesis/config/mzbuild.yml | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 test/antithesis/config/mzbuild.yml diff --git 
a/test/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile index fb59d4a2bd588..386049db7e8e5 100644 --- a/test/antithesis/config/Dockerfile +++ b/test/antithesis/config/Dockerfile @@ -1,2 +1,15 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Antithesis config image: a FROM-scratch tarball holding the resolved +# docker-compose.yaml that Antithesis uses to bring up the system under +# test. See mzbuild.yml for regeneration instructions. + FROM scratch COPY docker-compose.yaml / diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml new file mode 100644 index 0000000000000..899d620d1285f --- /dev/null +++ b/test/antithesis/config/mzbuild.yml @@ -0,0 +1,19 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# FROM-scratch image holding the resolved docker-compose.yaml for the +# Antithesis environment. Antithesis pulls this image and reads the compose +# spec from `/docker-compose.yaml` to bring up the system under test. +# +# The compose file is generated from test/antithesis/mzcompose.py via +# `bin/pyactivate test/antithesis/export-compose.py`. Re-run that whenever +# the composition topology changes; CI verifies the committed copy is up to +# date. 
+ +name: antithesis-config From 92569ac49eb631642d59301b676685cc000ff9c8 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 13:52:00 -0400 Subject: [PATCH 06/65] test/antithesis: add copyright headers --- test/antithesis/config/docker-compose.yaml | 13 +++++++++++++ test/antithesis/export-compose.py | 10 ++++++++++ test/antithesis/workload/Dockerfile | 9 +++++++++ test/antithesis/workload/mzbuild.yml | 9 +++++++++ test/antithesis/workload/setup-complete.sh | 10 ++++++++++ .../workload/test/anytime_health_check.sh | 10 ++++++++++ test/antithesis/workload/workload-entrypoint.sh | 10 ++++++++++ 7 files changed, 71 insertions(+) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 004fc60b245d9..f1f359ad7dfef 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -1,3 +1,16 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# GENERATED FILE — do not edit. Regenerate via: +# bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml +# Source of truth: test/antithesis/mzcompose.py. + services: postgres-metadata: command: diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 6d7e463d564a2..80b20b2d9e9f6 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -1,4 +1,14 @@ #!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + """Export the resolved docker-compose YAML for the Antithesis composition. Loads the mzcompose composition and dumps the compose dict to stdout as diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile index 804cb1b3009ec..b72a6b541d818 100644 --- a/test/antithesis/workload/Dockerfile +++ b/test/antithesis/workload/Dockerfile @@ -1,3 +1,12 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + # Antithesis workload client for Materialize. # # Python-based test driver that connects to materialized via pgwire, diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml index beed6bf84e93b..f62b4c073bb00 100644 --- a/test/antithesis/workload/mzbuild.yml +++ b/test/antithesis/workload/mzbuild.yml @@ -1 +1,10 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ name: antithesis-workload diff --git a/test/antithesis/workload/setup-complete.sh b/test/antithesis/workload/setup-complete.sh index 59384ae9ba2b4..ecae58fa23e44 100755 --- a/test/antithesis/workload/setup-complete.sh +++ b/test/antithesis/workload/setup-complete.sh @@ -1,4 +1,14 @@ #!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + set -euo pipefail # Run this script to inform Antithesis that it can start running Test Composer diff --git a/test/antithesis/workload/test/anytime_health_check.sh b/test/antithesis/workload/test/anytime_health_check.sh index f7d743ebc4cd7..641aed971be93 100755 --- a/test/antithesis/workload/test/anytime_health_check.sh +++ b/test/antithesis/workload/test/anytime_health_check.sh @@ -1,4 +1,14 @@ #!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + set -euo pipefail # Basic health check — verifies materialized is responding to SQL. diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index 0f5b012c3ad9e..f37eb275ab1e7 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -1,4 +1,14 @@ #!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + set -euo pipefail # Wait for materialized to be ready before signaling setup_complete. From 0a7d801d5710ec6c174f685229e150748e38474a Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 13:55:17 -0400 Subject: [PATCH 07/65] test/antithesis: rewrite export-compose.py to use mzbuild specs --- test/antithesis/config/docker-compose.yaml | 36 ++- test/antithesis/export-compose.py | 255 ++++++++++++++------- test/antithesis/mzcompose.py | 20 +- 3 files changed, 203 insertions(+), 108 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index f1f359ad7dfef..dc75d1e5f2ef8 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -44,12 +44,27 @@ services: entrypoint: - sh - -c - - 'echo "CREATE ROLE root WITH LOGIN PASSWORD ''root'';CREATE DATABASE root;GRANT - ALL PRIVILEGES ON DATABASE root TO root;\c root;CREATE SCHEMA IF NOT EXISTS - consensus AUTHORIZATION root;CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION - root;CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;CREATE SCHEMA IF - NOT EXISTS tsoracle AUTHORIZATION root;GRANT ALL PRIVILEGES ON SCHEMA public - TO root;" > /docker-entrypoint-initdb.d/z_setup_materialize.sql + - 'cat <<''SQL'' > /docker-entrypoint-initdb.d/z_setup_materialize.sql + + CREATE ROLE root WITH LOGIN PASSWORD ''root''; + + CREATE DATABASE root; + + GRANT ALL PRIVILEGES ON DATABASE root TO root; + + \c root + + CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root; + + CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root; + + CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root; + + CREATE SCHEMA 
IF NOT EXISTS tsoracle AUTHORIZATION root; + + GRANT ALL PRIVILEGES ON SCHEMA public TO root; + + SQL exec docker-entrypoint.sh "$$@"' - -- @@ -142,7 +157,6 @@ services: environment: - MZ_NO_TELEMETRY=1 - MZ_NO_BUILTIN_CONSOLE=1 - - MZ_EAT_MY_DATA=1 - MZ_TEST_ONLY_DUMMY_SEGMENT_CLIENT=true - MZ_SOFT_ASSERTIONS=1 - MZ_ORCHESTRATOR_PROCESS_TCP_PROXY_LISTEN_ADDR=0.0.0.0 @@ -153,8 +167,6 @@ services: - MZ_AWS_CONNECTION_ROLE_ARN=arn:aws:iam::123456789000:role/MaterializeConnection - MZ_AWS_EXTERNAL_ID_PREFIX=eb5cb59b-e2fe-41f3-87ca-d2176a495345 - MZ_CATALOG_STORE=persist - - MZ_LOG_FILTER - - CLUSTERD_LOG_FILTER - 'MZ_CLUSTER_REPLICA_SIZES={"bootstrap": {"cpu_exclusive": false, "cpu_limit": null, "credits_per_hour": "1", "disabled": false, "disk_limit": null, "is_cc": true, "memory_limit": "4 GiB", "scale": 1, "workers": 1}, "scale=2,workers=4": @@ -284,7 +296,7 @@ services: - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1 - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s - COCKROACH_LOG_MAX_SYNC_DURATION=120s - - 
MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;c
ompute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_repli
cation_factor=1 + - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydra
tion_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collectio
n_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1 - MZ_NO_EXTERNAL_CLUSTERD=1 - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle - MZ_NO_BUILTIN_POSTGRES=1 @@ -307,7 +319,7 @@ services: start_period: 600s stop_grace_period: 120s platform: linux/amd64 - image: materialize-materialized:latest + image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7 workload: depends_on: materialized: @@ -321,7 +333,7 @@ services: - KAFKA_BROKER=kafka:9092 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 platform: linux/amd64 - image: materialize-workload:latest + image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-4ENC44FVTZ7WPGVUTKUVI5N7CMOJS2O2 networks: {} volumes: mzdata: null diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 80b20b2d9e9f6..081ce78ed41db 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -11,102 +11,199 @@ """Export the resolved docker-compose YAML for the Antithesis composition. -Loads the mzcompose composition and dumps the compose dict to stdout as -YAML — without building any images or requiring a running Docker daemon. +Loads `test/antithesis/mzcompose.py`, resolves every `mzbuild:` reference, +and dumps the resulting docker-compose dict to stdout. Antithesis pulls the +referenced images directly from public GHCR — no separate registry, no +re-tagging. -mzbuild references are replaced with public images where possible, -or local tags for images that must be built (e.g. the workload). +Image-reference policy: + + * Materialize-built images (`materialized`, `antithesis-workload`) are + emitted as `ghcr.io/materializeinc/materialize/:mzbuild-`. + The fingerprint participates in `antithesis=True` so antithesis builds + don't collide with regular builds. 
+ + * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with the + public upstream image. Our mzbuild variants bake in test-friendly + patches (eatmydata, no_fsync) that defeat Antithesis's fault injection; + Antithesis runs against vanilla. + +The script also strips mzcompose-only keys, host bind-mounts, and host-path +env vars that don't resolve inside the Antithesis sandbox, and inlines the +postgres bootstrap SQL into the entrypoint (the bind-mount path won't +exist). Usage: - bin/pyactivate test/antithesis/export-compose.py > antithesis/config/docker-compose.yaml + bin/pyactivate test/antithesis/export-compose.py \\ + > test/antithesis/config/docker-compose.yaml """ import sys from pathlib import Path +from typing import Any import yaml +from materialize import MZ_ROOT from materialize.mzbuild import Repository from materialize.mzcompose.composition import Composition -# Map mzbuild names → image references for the Antithesis compose. -# Public images for infra; local build tag for the workload. -MZBUILD_TO_IMAGE = { - "materialized": "materialize-materialized:latest", +# mzbuild image names that we publish to GHCR and want Antithesis to pull +# under our fingerprint. Everything else falls back to a public image. +MATERIALIZE_IMAGES = {"materialized", "antithesis-workload"} + +# Public-image fallbacks for mzbuild images whose Materialize-specific +# customizations subvert Antithesis (eatmydata, fsync no-ops, etc.). 
+PUBLIC_FALLBACKS = { "postgres": "postgres:17.7", "minio": "minio/minio:latest", - "antithesis-workload": "materialize-workload:latest", } -repo = Repository(Path("."), arch="x86_64") -c = Composition(repo, "antithesis", munge_services=False) - -for name, svc in c.compose["services"].items(): - svc["platform"] = "linux/amd64" - - if "mzbuild" in svc: - mzbuild_name = svc.pop("mzbuild") - if mzbuild_name not in MZBUILD_TO_IMAGE: - print( - f"warning: no image mapping for mzbuild {mzbuild_name!r}, " - f"using {mzbuild_name}:latest", - file=sys.stderr, - ) - svc["image"] = f"{mzbuild_name}:latest" - else: - svc["image"] = MZBUILD_TO_IMAGE[mzbuild_name] - - # Fixups for vanilla postgres (the mzbuild image has eatmydata, custom - # pg_hba.conf, and baked-in init SQL — none of which exist in the public image). - if svc.get("image", "").startswith("postgres:"): - env = svc.get("environment", []) - # Remove eatmydata — not installed in vanilla postgres - env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")] - # Trust auth so materialized can connect as root without a password - env.append("POSTGRES_HOST_AUTH_METHOD=trust") - # Remove host bind-mount for setup SQL — won't exist in Antithesis. - # Instead, inline the init SQL that creates the schemas materialized needs. 
- vols = svc.get("volumes", []) - vols[:] = [v for v in vols if "setup_materialize.sql" not in v] - if not vols: - del svc["volumes"] - # Inline the init SQL as a script volume - init_sql = ( - "CREATE ROLE root WITH LOGIN PASSWORD 'root';" - "CREATE DATABASE root;" - "GRANT ALL PRIVILEGES ON DATABASE root TO root;" - r"\c root;" - "CREATE SCHEMA IF NOT EXISTS consensus AUTHORIZATION root;" - "CREATE SCHEMA IF NOT EXISTS adapter AUTHORIZATION root;" - "CREATE SCHEMA IF NOT EXISTS storage AUTHORIZATION root;" - "CREATE SCHEMA IF NOT EXISTS tsoracle AUTHORIZATION root;" - "GRANT ALL PRIVILEGES ON SCHEMA public TO root;" +# Header prepended to the generated YAML so check-copyright passes and +# readers know the file isn't hand-edited. +HEADER = """\ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# GENERATED FILE — do not edit. Regenerate via: +# bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml +# Source of truth: test/antithesis/mzcompose.py. + +""" + + +def resolve_mzbuild(svc: dict[str, Any], deps: Any) -> None: + """Replace `mzbuild:` with a concrete `image:` ref.""" + name = svc.pop("mzbuild") + if name in MATERIALIZE_IMAGES: + svc["image"] = deps[name].spec() + elif name in PUBLIC_FALLBACKS: + svc["image"] = PUBLIC_FALLBACKS[name] + else: + raise ValueError( + f"mzbuild image {name!r} has no Antithesis policy — add it to " + f"MATERIALIZE_IMAGES (use our GHCR build) or PUBLIC_FALLBACKS " + f"(swap to a public image) in export-compose.py." 
) - svc.setdefault("entrypoint", []) - svc["entrypoint"] = ["sh", "-c", f""" -echo "{init_sql}" > /docker-entrypoint-initdb.d/z_setup_materialize.sql -exec docker-entrypoint.sh "$$@" -""".strip(), "--"] - - # Strip host bind-mounts — they won't resolve in Antithesis - if "volumes" in svc: - svc["volumes"] = [ - v for v in svc["volumes"] - if not isinstance(v, str) or ":" not in v or not v.split(":")[0].startswith("/") - ] - if not svc["volumes"]: - del svc["volumes"] - - # Remove env vars that point at host-only paths (the Docker image - # entrypoint provides sensible defaults when these are unset) - if "environment" in svc: - svc["environment"] = [ - e for e in svc["environment"] - if not e.startswith(("MZ_LISTENERS_CONFIG_PATH=", "MZ_EXTERNAL_LOGIN_PASSWORD_")) - ] - - # Drop mzcompose-only keys that docker/podman compose doesn't understand - for key in ["propagate_uid_gid", "allow_host_ports", "publish"]: + + +def inline_postgres_setup(svc: dict[str, Any]) -> None: + """Replace the bind-mounted setup SQL with an inline entrypoint write. + + Antithesis has no host filesystem, so we can't mount the SQL file. + Read it from misc/postgres/setup_materialize.sql (one source of truth) + and bake it into the service entrypoint. + """ + if not svc.get("image", "").startswith("postgres:"): + return + + env = svc.setdefault("environment", []) + # eatmydata isn't installed in the public postgres image. + env[:] = [e for e in env if not e.startswith("LD_PRELOAD=")] + # Trust auth — Antithesis-internal traffic only. + env.append("POSTGRES_HOST_AUTH_METHOD=trust") + + # Drop the bind-mounted setup SQL; we'll inline it. + vols = svc.get("volumes", []) + vols[:] = [v for v in vols if "setup_materialize.sql" not in v] + if not vols: + svc.pop("volumes", None) + + setup_sql = (MZ_ROOT / "misc" / "postgres" / "setup_materialize.sql").read_text() + # Strip comment lines + collapse to one statement per output line so we + # can safely double-quote it inside the sh -c here. 
+ setup_sql = "\n".join( + line for line in setup_sql.splitlines() if line and not line.startswith("--") + ) + svc["entrypoint"] = [ + "sh", + "-c", + # `$$@` survives compose's $-interpolation and arrives as `$@` at the + # shell, forwarding any args (e.g., the `postgres` CMD) verbatim. + f"cat <<'SQL' > /docker-entrypoint-initdb.d/z_setup_materialize.sql\n" + f"{setup_sql}\n" + f"SQL\n" + f'exec docker-entrypoint.sh "$$@"', + "--", + ] + + +def strip_host_bindmounts(svc: dict[str, Any]) -> None: + """Drop volume entries that bind-mount a host path.""" + if "volumes" not in svc: + return + svc["volumes"] = [ + v + for v in svc["volumes"] + if not isinstance(v, str) + or ":" not in v + or not v.split(":", 1)[0].startswith("/") + ] + if not svc["volumes"]: + del svc["volumes"] + + +def strip_incompatible_env(svc: dict[str, Any]) -> None: + """Drop env vars that are unsafe or unresolvable under Antithesis. + + - `MZ_EAT_MY_DATA` enables `libeatmydata.so` (fsync no-op) — fatal for + crash-recovery testing under fault injection. + - `MZ_LISTENERS_CONFIG_PATH` and `MZ_EXTERNAL_LOGIN_PASSWORD_*` reference + host paths or host secrets that don't exist in the sandbox. + - Bare env vars (no `=`) inherit from the host environment, which is + empty under Antithesis; drop them so materialized's built-in defaults + apply. 
+ """ + if "environment" not in svc: + return + drop_prefixes = ( + "MZ_EAT_MY_DATA=", + "MZ_LISTENERS_CONFIG_PATH=", + "MZ_EXTERNAL_LOGIN_PASSWORD_", + ) + svc["environment"] = [ + e for e in svc["environment"] if "=" in e and not e.startswith(drop_prefixes) + ] + + +def strip_mzcompose_keys(svc: dict[str, Any]) -> None: + """Drop keys understood by mzcompose but not by docker/podman compose.""" + for key in ("propagate_uid_gid", "allow_host_ports", "publish"): svc.pop(key, None) -yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) + +def main() -> None: + # munge_services=False keeps ports bare (e.g., `6875` instead of + # `127.0.0.1::6875`) — Antithesis is container-to-container, no host + # binding. We do our own mzbuild→image substitution below. + repo = Repository(Path("."), arch="x86_64", antithesis=True) + c = Composition(repo, "antithesis", munge_services=False) + + images = [ + repo.images[svc["mzbuild"]] + for svc in c.compose["services"].values() + if "mzbuild" in svc + ] + deps = repo.resolve_dependencies(images) + + for svc in c.compose["services"].values(): + svc["platform"] = "linux/amd64" + if "mzbuild" in svc: + resolve_mzbuild(svc, deps) + inline_postgres_setup(svc) + strip_host_bindmounts(svc) + strip_incompatible_env(svc) + strip_mzcompose_keys(svc) + + sys.stdout.write(HEADER) + yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) + + +if __name__ == "__main__": + main() diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index d84b0f0108bd5..c5320b38b0f80 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -18,15 +18,11 @@ - workload: Python test driver with Antithesis SDK Usage: - bin/mzcompose --find antithesis run default # bring up the cluster - bin/mzcompose --find antithesis run export-compose # dump compose YAML + bin/mzcompose --find antithesis run default # bring up the cluster + bin/pyactivate test/antithesis/export-compose.py > 
config/... # dump compose YAML """ -import sys - -import yaml - -from materialize.mzcompose.composition import Composition, WorkflowArgumentParser +from materialize.mzcompose.composition import Composition from materialize.mzcompose.service import Service, ServiceConfig from materialize.mzcompose.services.materialized import Materialized from materialize.mzcompose.services.minio import Minio @@ -76,13 +72,3 @@ def workflow_default(c: Composition) -> None: c.up("postgres-metadata", "minio", "redpanda") c.up("materialized") c.up("workload") - - -def workflow_export_compose(c: Composition) -> None: - """Export the resolved docker-compose YAML to stdout. - - Usage: - bin/mzcompose --find antithesis run export-compose > antithesis/config/docker-compose.yaml - """ - # c.compose is the fully-resolved compose dict (mzbuild: replaced with image:) - yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) From d7cc3c466c85a42e82bb78fa97bfd5d4052295fc Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:02:50 -0400 Subject: [PATCH 08/65] test/antithesis: strip Makefile to mzbuild-driven build (drop registry/retag hacks) --- test/antithesis/Makefile | 138 +++++++++++++++------------------------ 1 file changed, 52 insertions(+), 86 deletions(-) diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile index bee47a5d6f2c6..f25077b582d33 100644 --- a/test/antithesis/Makefile +++ b/test/antithesis/Makefile @@ -1,20 +1,35 @@ -# Build / run helper for the Materialize Antithesis harness. +# Copyright Materialize, Inc. and contributors. All rights reserved. 
# -# Usage: -# make build # build every local image -# make up # export compose, build, bring up the stack -# make test # smoke test against the running cluster -# make push # push locally-built images to Antithesis registry -# make down # tear down (preserves volumes) -# make clean # tear down + remove volumes + images -# make smoke # full cycle: build → up → test +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Local-dev helper for the Materialize Antithesis harness. +# +# Antithesis images ship via the standard mzbuild → GHCR flow; CI publishes +# them the same way it publishes every other image, fingerprint-tagged with +# `mzbuild-<fingerprint>`. Locally, we just acquire the mzbuild images, regenerate +# the compose YAML, and let `docker compose` find them by their canonical +# spec. +# +# Targets: +# make build # regenerate compose YAML, acquire local mzbuild images +# make up # build + bring up the stack +# make down # tear down (preserves volumes) +# make smoke # build + up + smoke test +# make test # smoke test against a running stack +# make clean # tear down + remove volumes SHELL := /usr/bin/env bash .SHELLFLAGS := -eu -o pipefail -c -PROJECT := materialize +PROJECT := materialize-antithesis REPO_ROOT := $(realpath $(dir $(lastword $(MAKEFILE_LIST)))/../..) +# Pick podman if available, else docker.
ifndef RUNTIME RUNTIME := $(shell command -v podman >/dev/null 2>&1 && echo podman || (command -v docker >/dev/null 2>&1 && echo docker || echo none)) endif @@ -25,104 +40,55 @@ ifeq ($(RUNTIME),podman) export MZ_DEV_CI_BUILDER_RUNTIME := podman endif -COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f config/docker-compose.yaml -PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize +COMPOSE_FILE := $(REPO_ROOT)/test/antithesis/config/docker-compose.yaml +COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f $(COMPOSE_FILE) +PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize -REGISTRY ?= us-central1-docker.pkg.dev -REGISTRY_PATH ?= /molten-verve-216720/materialize-repository +# mzbuild images we need built locally. Third-party images (postgres, minio, +# redpanda) are pulled by `docker compose` from their upstream registries. +MZBUILD_IMAGES := materialized antithesis-workload # --------------------------------------------------------------------------- -# Export — generate the resolved docker-compose YAML for Antithesis. +# Build # --------------------------------------------------------------------------- -.PHONY: export-compose +.PHONY: build export-compose acquire-images + +build: export-compose acquire-images + export-compose: - cd $(REPO_ROOT) && $(REPO_ROOT)/bin/pyactivate test/antithesis/export-compose.py > test/antithesis/config/docker-compose.yaml - @echo "Wrote test/antithesis/config/docker-compose.yaml" + cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py > $(COMPOSE_FILE) + @echo "Wrote $(COMPOSE_FILE)" -# --------------------------------------------------------------------------- -# Build — build images that don't have public equivalents. 
-# --------------------------------------------------------------------------- -LOCAL_IMAGES := materialized workload config -BUILD_TARGETS := $(LOCAL_IMAGES:%=build-%) - -.PHONY: build $(BUILD_TARGETS) build-builder -build: export-compose $(BUILD_TARGETS) - -build-builder: - cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder build stable --load - @tag=$$(cd $(REPO_ROOT) && $(REPO_ROOT)/bin/ci-builder tag stable); \ - echo "Tagging materialize/ci-builder:$$tag -> $(PROJECT)-builder:latest"; \ - $(RUNTIME) tag "materialize/ci-builder:$$tag" $(PROJECT)-builder:latest - -build-materialized: build-builder - cd $(REPO_ROOT) && $(REPO_ROOT)/bin/mzimage acquire materialized --antithesis - @# Tag the mzbuild output to the name the compose file expects - @img=$$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \ - | grep 'materialized:mzbuild-' | head -1); \ - echo "Tagging $$img -> $(PROJECT)-materialized:latest"; \ - $(RUNTIME) tag "$$img" $(PROJECT)-materialized:latest - -build-workload: - $(RUNTIME) build \ - --platform linux/amd64 \ - -t $(PROJECT)-workload:latest \ - $(REPO_ROOT)/test/antithesis/workload - -build-config: export-compose - $(RUNTIME) build \ - --platform linux/amd64 \ - -t $(PROJECT)-config:latest \ - config +acquire-images: + @for image in $(MZBUILD_IMAGES); do \ + echo "--- Acquiring $$image (--antithesis)"; \ + cd $(REPO_ROOT) && bin/mzimage acquire "$$image" --antithesis; \ + done # --------------------------------------------------------------------------- # Up / Down # --------------------------------------------------------------------------- -.PHONY: up -up: export-compose build +.PHONY: up down clean + +up: build $(COMPOSE) up -d -.PHONY: down down: $(COMPOSE) down +clean: down + $(COMPOSE) down -v --remove-orphans 2>/dev/null || true + # --------------------------------------------------------------------------- -# Test — quick smoke test against the running cluster +# Test # 
--------------------------------------------------------------------------- -.PHONY: test +.PHONY: test smoke + test: $(PSQL) -c "CREATE TABLE IF NOT EXISTS smoke_test (k INT, v TEXT)" $(PSQL) -c "INSERT INTO smoke_test VALUES (1, 'hello'), (2, 'world')" $(PSQL) -c "SELECT * FROM smoke_test ORDER BY k" $(PSQL) -c "DROP TABLE smoke_test" -# --------------------------------------------------------------------------- -# Push — tag local images and push to the Antithesis registry -# --------------------------------------------------------------------------- -.PHONY: push -push: - @$(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' \ - | grep ':latest$$' | grep '^\(localhost/\)\?$(PROJECT)-' \ - | grep -v '$(PROJECT)-builder' \ - | while read item; do \ - nametag="$${item#localhost/}"; \ - name="$${nametag%:*}"; \ - remote="$(REGISTRY)$(REGISTRY_PATH)/$${name}:latest"; \ - echo "Pushing $${item} -> $${remote}"; \ - $(RUNTIME) tag "$${item}" "$${remote}" || exit 1; \ - $(RUNTIME) push "$${remote}" || exit 1; \ - done - -# --------------------------------------------------------------------------- -# Clean -# --------------------------------------------------------------------------- -.PHONY: clean -clean: down - $(COMPOSE) down -v --remove-orphans 2>/dev/null || true - -$(RUNTIME) rmi $$($(RUNTIME) images --format '{{.Repository}}:{{.Tag}}' | grep '^$(PROJECT)-' || true) 2>/dev/null - -# --------------------------------------------------------------------------- -# Smoke — full cycle: build → up → test -# --------------------------------------------------------------------------- -.PHONY: smoke smoke: up test @echo "[smoke] passed" From 106c9a9feb3541232d6dd2c6e3b41fa879ec3e09 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:03:43 -0400 Subject: [PATCH 09/65] ci: nightly antithesis builds via CI_ANTITHESIS env passthrough --- ci/mkpipeline.py | 32 ++++++++++++++++++++++ ci/nightly/pipeline.template.yml | 19 +++++++++++++ ci/test/build.py | 2 ++ 
ci/test/lint-main/checks/check-pipeline.sh | 1 + 4 files changed, 54 insertions(+) diff --git a/ci/mkpipeline.py b/ci/mkpipeline.py index 79fcb7bd2a0c9..d6be6018c7532 100644 --- a/ci/mkpipeline.py +++ b/ci/mkpipeline.py @@ -121,6 +121,12 @@ def main() -> int: type=Sanitizer, choices=Sanitizer, ) + parser.add_argument( + "--antithesis", + action="store_true", + default=ui.env_is_truthy("CI_ANTITHESIS"), + help="enable Antithesis coverage instrumentation", + ) parser.add_argument( "--priority", type=int, @@ -166,6 +172,7 @@ def get_hashes(arch: Arch) -> tuple[str, bool]: arch=arch, coverage=args.coverage, sanitizer=args.sanitizer, + antithesis=args.antithesis, ) deps = repo.resolve_dependencies(image for image in repo if image.publish) check = deps.check() @@ -209,6 +216,7 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) trim_ci_glue_exempt_steps(pipeline) else: @@ -218,9 +226,11 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) truncate_skip_length(pipeline) handle_sanitizer_skip(pipeline, args.sanitizer) + handle_antithesis_skip(pipeline, args.antithesis) increase_agents_timeouts(pipeline, args.sanitizer, args.coverage) prioritize_pipeline(pipeline, args.priority) switch_jobs_to_aws(pipeline, args.priority) @@ -240,6 +250,7 @@ def fetch_hashes() -> None: args.coverage, args.sanitizer, lto, + args.antithesis, ) add_nightly_deploy_dependency(pipeline, args.pipeline) remove_dependencies_on_prs(pipeline, args.pipeline, hash_check) @@ -328,6 +339,21 @@ def handle_sanitizer_skip(pipeline: Any, sanitizer: Sanitizer) -> None: step["skip"] = True +def handle_antithesis_skip(pipeline: Any, antithesis: bool) -> None: + if antithesis: + pipeline.setdefault("env", {})["CI_ANTITHESIS"] = "1" + + for step in steps(pipeline): + if step.get("antithesis") == "skip": + step["skip"] = True + + else: + + for step in steps(pipeline): + if step.get("antithesis") == "only": + step["skip"] = True + + def 
increase_agents_timeouts( pipeline: Any, sanitizer: Sanitizer, coverage: bool ) -> None: @@ -711,6 +737,7 @@ def trim_tests_pipeline( coverage: bool, sanitizer: Sanitizer, lto: bool, + antithesis: bool = False, ) -> None: """Trim pipeline steps whose inputs have not changed in this branch. @@ -731,6 +758,7 @@ def trim_tests_pipeline( profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED, coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, ) deps = repo.resolve_dependencies(image for image in repo) @@ -917,6 +945,7 @@ def add_cargo_test_dependency( coverage: bool, sanitizer: Sanitizer, lto: bool, + antithesis: bool = False, ) -> None: """Cargo Test normally doesn't have to wait for the build to complete, but it requires a few images (ubuntu-base, postgres), which are rarely changed. So only add a dependency when those images are not on Dockerhub yet.""" if pipeline_name not in ("test", "nightly"): @@ -933,6 +962,7 @@ def add_cargo_test_dependency( profile=mzbuild.Profile.RELEASE if lto else mzbuild.Profile.OPTIMIZED, coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, ) composition = Composition(repo, name="cargo-test") deps = composition.dependencies @@ -1090,6 +1120,8 @@ def remove_mz_specific_keys(pipeline: Any) -> None: del step["coverage"] if "sanitizer" in step: del step["sanitizer"] + if "antithesis" in step: + del step["antithesis"] if "ci_glue_exempt" in step: del step["ci_glue_exempt"] if ( diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index d10055451b451..f1cf3e4c7ed69 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -65,6 +65,25 @@ steps: branches: "main" skip: "currently broken" + - id: build-x86_64-antithesis + label: ":rust: Build x86_64 (Antithesis)" + command: bin/ci-builder run stable bin/pyactivate -m ci.test.build + inputs: + - "*" + depends_on: [] + timeout_in_minutes: 90 + agents: + queue: l-builder-linux-x86_64 + env: + 
CI_ANTITHESIS: "1" + # Antithesis-flavored images get distinct mzbuild fingerprints, so + # they coexist with regular GHCR tags. The build is x86_64-only — + # Antithesis runs amd64 sandboxes. + sanitizer: skip + coverage: skip + antithesis: skip + branches: "main" + - id: build-rust-latest-beta label: "Build with Latest Rust Beta" command: bin/ci-builder run stable ci/test/rust-beta-build.sh diff --git a/ci/test/build.py b/ci/test/build.py index d91e82ffe2734..5b18ce91e9b31 100755 --- a/ci/test/build.py +++ b/ci/test/build.py @@ -34,11 +34,13 @@ def main() -> None: set_build_status("pending") coverage = ui.env_is_truthy("CI_COVERAGE_ENABLED") sanitizer = Sanitizer[os.getenv("CI_SANITIZER", "none")] + antithesis = ui.env_is_truthy("CI_ANTITHESIS") repo = mzbuild.Repository( Path("."), coverage=coverage, sanitizer=sanitizer, + antithesis=antithesis, image_registry="materialize", ) diff --git a/ci/test/lint-main/checks/check-pipeline.sh b/ci/test/lint-main/checks/check-pipeline.sh index baed7ae9a717c..95da47ae547c8 100755 --- a/ci/test/lint-main/checks/check-pipeline.sh +++ b/ci/test/lint-main/checks/check-pipeline.sh @@ -28,6 +28,7 @@ unset CI_TEST_IDS unset CI_TEST_SELECTION unset CI_SANITIZER unset CI_COVERAGE_ENABLED +unset CI_ANTITHESIS unset CI_WAITING_FOR_BUILD pids=() From e0214600b7c9497a71b82508a106e050e1b32082 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:13:50 -0400 Subject: [PATCH 10/65] ci: lint check that test/antithesis compose YAML matches mzcompose.py --- .../checks/check-antithesis-compose.sh | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100755 ci/test/lint-main/checks/check-antithesis-compose.sh diff --git a/ci/test/lint-main/checks/check-antithesis-compose.sh b/ci/test/lint-main/checks/check-antithesis-compose.sh new file mode 100755 index 0000000000000..add2f2a0dab57 --- /dev/null +++ b/ci/test/lint-main/checks/check-antithesis-compose.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 
Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# check-antithesis-compose.sh — ensure test/antithesis/config/docker-compose.yaml +# is in sync with test/antithesis/mzcompose.py. +# +# Fingerprint refs (`mzbuild-`) shift on every materialized code change, +# so we mask them before diffing — we only want to catch composition +# (services, ports, env, deps) drift, not transient fingerprint churn. + +set -euo pipefail + +cd "$(dirname "$0")/../../../.." + +. misc/shlib/shlib.bash + +check_antithesis_compose() { + local committed=test/antithesis/config/docker-compose.yaml + local generated rc=0 + generated=$(mktemp) + + bin/pyactivate test/antithesis/export-compose.py > "$generated" + + # Mask `mzbuild-` so the diff is structural-only. + local mask='s/(mzbuild-)[A-Z0-9]+/\1FINGERPRINT/g' + if ! diff -u \ + <(sed -E "$mask" "$committed") \ + <(sed -E "$mask" "$generated"); then + echo + echo "$committed is out of sync with test/antithesis/mzcompose.py." 
+ echo "Regenerate with:" + echo " bin/pyactivate test/antithesis/export-compose.py > $committed" + rc=1 + fi + + rm -f "$generated" + return $rc +} + +try check_antithesis_compose + +try_status_report From ff5c6d79b14d129df416bed54f7f87139e75ff8d Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:24:08 -0400 Subject: [PATCH 11/65] ci: drop branches:main on build-x86_64-antithesis (validating) --- ci/nightly/pipeline.template.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index f1cf3e4c7ed69..065fbe0488b6f 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -82,7 +82,6 @@ steps: sanitizer: skip coverage: skip antithesis: skip - branches: "main" - id: build-rust-latest-beta label: "Build with Latest Rust Beta" From 0f59e7d82451b64c4c9a4d4b9ddb40666981d602 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:31:08 -0400 Subject: [PATCH 12/65] test/antithesis: switch to Kafka stack + external clusterd --- test/antithesis/config/docker-compose.yaml | 147 ++++++++++++++---- test/antithesis/mzcompose.py | 50 ++++-- .../workload/workload-entrypoint.sh | 33 +++- 3 files changed, 188 insertions(+), 42 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index dc75d1e5f2ef8..26819190cd164 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -93,44 +93,123 @@ services: start_period: 30s platform: linux/amd64 image: minio/minio:latest - redpanda: - image: redpandadata/redpanda:v25.2.11 + zookeeper: + image: confluentinc/cp-zookeeper:7.9.4 + ports: + - 2181 + environment: + - ZOOKEEPER_CLIENT_PORT=2181 + healthcheck: + test: + - CMD + - nc + - -z + - localhost + - '2181' + interval: 1s + start_period: 120s + platform: linux/amd64 + kafka: + image: confluentinc/cp-kafka:7.9.4 + ports: + - '9092' + environment: + - 
KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MIN_INSYNC_REPLICAS=1 + - KAFKA_OFFSETS_TOPIC_NUM_PARTITIONS=1 + - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1 + - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1 + - KAFKA_MESSAGE_MAX_BYTES=15728640 + - KAFKA_REPLICA_FETCH_MAX_BYTES=15728640 + - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=100 + - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 + - KAFKA_BROKER_ID=1 + - KAFKA_AUTO_CREATE_TOPICS_ENABLE=True + - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 + depends_on: + zookeeper: + condition: service_started + healthcheck: + test: + - CMD + - nc + - -z + - localhost + - '9092' + interval: 1s + start_period: 120s + platform: linux/amd64 + schema-registry: + image: confluentinc/cp-schema-registry:7.9.4 ports: - - 9092 - 8081 - command: - - redpanda - - start - - --overprovisioned - - --smp=1 - - --memory=1G - - --reserve-memory=0M - - --node-id=0 - - --check=false - - --set - - redpanda.enable_transactions=true - - --set - - redpanda.enable_idempotence=true - - --set - - redpanda.auto_create_topics_enabled=True - - --set - - redpanda.topic_memory_per_partition=4096 - - --set - - --advertise-kafka-addr=kafka:9092 networks: default: - aliases: - - kafka - - schema-registry + aliases: [] + environment: + - SCHEMA_REGISTRY_KAFKASTORE_TIMEOUT_MS=10000 + - SCHEMA_REGISTRY_KAFKASTORE_TOPIC_REPLICATION_FACTOR=1 + - SCHEMA_REGISTRY_HOST_NAME=schema-registry + - SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS=PLAINTEXT://kafka:9092 + command: + - /bin/bash + - -c + - . /etc/confluent/docker/bash-config && . /etc/confluent/docker/mesos-setup.sh + && . 
/etc/confluent/docker/apply-mesos-overrides && /etc/confluent/docker/configure + && exec /etc/confluent/docker/launch + depends_on: + kafka: + condition: service_started healthcheck: test: - CMD - curl - - -f - - localhost:9644/v1/status/ready + - -fu + - materialize:sekurity + - localhost:8081 interval: 1s start_period: 120s platform: linux/amd64 + clusterd1: + entrypoint: + - tini + - -- + command: + - clusterd + - --scratch-directory=/scratch + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd1 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7 materialized: hostname: materialized depends_on: @@ -296,8 +375,7 @@ services: - MZ_BOOTSTRAP_DEFAULT_CLUSTER_REPLICATION_FACTOR=1 - COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT=120s - COCKROACH_LOG_MAX_SYNC_DURATION=120s - 
- MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2
;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;statement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;st
orage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1 - - MZ_NO_EXTERNAL_CLUSTERD=1 + - MZ_SYSTEM_PARAMETER_DEFAULT=unsafe_enable_unsafe_functions=true;allow_real_time_recency=true;constraint_based_timestamp_selection=verify;enable_compute_peek_response_stash=true;enable_0dt_deployment_panic_after_timeout=true;enable_0dt_deployment_sources=true;enable_alter_swap=true;enable_case_literal_transform=false;enable_cast_elimination=true;enable_coalesce_case_transform=true;enable_columnar_lgalloc=false;enable_columnation_lgalloc=false;enable_compute_correction_v2=true;enable_compute_logical_backpressure=true;enable_connection_validation_syntax=true;enable_copy_to_expr=true;enable_copy_from_remote=true;enable_create_table_from_source=true;enable_eager_delta_joins=true;enable_envelope_debezium_in_subscribe=true;enable_expressions_in_limit_syntax=true;enable_iceberg_sink=true;enable_introspection_subscribes=true;enable_kafka_sink_partition_by=true;enable_lgalloc=false;enable_load_generator_counter=true;enable_logical_compaction_window=true;enable_multi_worker_storage_persist_sink=true;enable_multi_replica_sources=true;enable_rbac_checks=true;enable_reduce_mfp_fusion=true;enable_refresh_every_mvs=true;enable_replacement_materialized_views=true;enable_cluster_schedule_refresh=true;enable_sql_server_source=true;enable_s3_tables_region_check=false;enable_statement_lifecycle_logging=true;enable_storage_introspection_logs=true;enable_compute_temporal_bucketing=true;enable_variadic_left_join_lowering=true;enable_worker_core_affinity=true;grpc_client_http2_keep_alive_timeout=5s;ore_overflowing_behavior=panic;unsafe_enable_table_keys=true;with_0dt_deployment_max_wait=1800s;persist_next_listen_batch_retryer_clamp=16s;persist_next_listen_batch_retryer_initial_backoff=100ms;persist_next_listen_batch_retryer_fixed_sleep=1200ms;persist_enable_arrow_lgalloc_noncc_sizes=true;persist_enable_s3_lgalloc_noncc_sizes=true;compute_correction
_v2_chain_proportionality=3;compute_correction_v2_chunk_size=8192;compute_dataflow_max_inflight_bytes=134217728;compute_hydration_concurrency=2;compute_replica_expiration_offset=3d;compute_apply_column_demands=true;compute_peek_response_stash_threshold_bytes=1048576;compute_subscribe_snapshot_optimization=true;enable_compute_sync_mv_sink=true;enable_password_auth=true;enable_frontend_peek_sequencing=true;enable_frontend_subscribes=true;enable_upsert_v2=false;default_timestamp_interval=1s;force_source_table_syntax=false;persist_batch_columnar_format=structured;persist_batch_delete_enabled=true;persist_batch_structured_order=true;persist_batch_builder_structured=true;persist_batch_structured_key_lower_len=256;persist_batch_max_run_len=4;persist_catalog_force_compaction_fuel=1024;persist_catalog_force_compaction_wait=1s;persist_stats_audit_percent=100;persist_stats_audit_panic=true;persist_encoding_enable_dictionary=true;persist_fast_path_limit=1000;persist_fast_path_order=true;persist_gc_use_active_gc=true;persist_gc_min_versions=16;persist_gc_max_versions=128000;persist_inline_writes_single_max_bytes=4096;persist_inline_writes_total_max_bytes=1048576;persist_pubsub_client_enabled=true;persist_pubsub_push_diff_enabled=true;persist_record_compactions=true;persist_record_schema_id=true;persist_rollup_use_active_rollup=true;persist_blob_target_size=16777216;persist_compaction_memory_bound_bytes=83886080;persist_enable_incremental_compaction=true;persist_use_critical_since_catalog=true;persist_use_critical_since_snapshot=false;persist_use_critical_since_source=false;persist_part_decode_format=arrow;persist_blob_cache_scale_with_threads=true;persist_state_update_lease_timeout=1s;arrangement_size_history_collection_interval=1h;arrangement_size_history_retention_period=7d;persist_validate_part_bounds_on_read=false;persist_validate_part_bounds_on_write=false;statement_logging_default_sample_rate=1.0;statement_logging_max_data_credit=;statement_logging_max_sample_rate=1.0;stat
ement_logging_target_data_rate=;storage_reclock_to_latest=true;storage_source_decode_fuel=100000;storage_statistics_collection_interval=1000;storage_statistics_interval=2000;storage_use_continual_feedback_upsert=true;default_cluster_replication_factor=1;unsafe_enable_unorchestrated_cluster_replicas=true - MZ_TIMESTAMP_ORACLE_URL=postgres://root@postgres-metadata:26257?options=--search_path=tsoracle - MZ_NO_BUILTIN_POSTGRES=1 - MZ_NO_BUILTIN_COCKROACH=1 @@ -324,16 +402,23 @@ services: depends_on: materialized: condition: service_healthy - redpanda: + clusterd1: + condition: service_started + kafka: condition: service_healthy + schema-registry: + condition: service_started environment: - PGHOST=materialized - PGPORT=6875 - PGUSER=materialize + - PGPORT_INTERNAL=6877 + - PGUSER_INTERNAL=mz_system - KAFKA_BROKER=kafka:9092 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 + - MZ_ANTITHESIS_CLUSTER=antithesis_cluster platform: linux/amd64 - image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-4ENC44FVTZ7WPGVUTKUVI5N7CMOJS2O2 + image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-XX2UEHO746TTSXP3JUOIMJTYD2WWEBLY networks: {} volumes: mzdata: null diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index c5320b38b0f80..552dd1d21e824 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -10,12 +10,15 @@ """ Antithesis test composition for Materialize. 
-Defines the minimal topology needed to exercise Materialize under Antithesis: - - postgres-metadata: consensus/catalog store - - minio: S3-compatible blob storage for persist - - redpanda: Kafka-compatible broker for source ingestion - - materialized: the SUT (embedded clusterd mode) - - workload: Python test driver with Antithesis SDK +Topology exercised under Antithesis: + - postgres-metadata : consensus/catalog/timestamp-oracle store + - minio : S3-compatible blob storage for persist + - zookeeper + kafka : Kafka broker for source ingestion + - schema-registry : Avro/Protobuf schemas for kafka sources + - clusterd1 : external compute+storage process — fenceable + independently of materialized for fault testing + - materialized : the SUT (environmentd; clusterd is external) + - workload : Python test driver wired to the Antithesis SDK Usage: bin/mzcompose --find antithesis run default # bring up the cluster @@ -24,10 +27,13 @@ from materialize.mzcompose.composition import Composition from materialize.mzcompose.service import Service, ServiceConfig +from materialize.mzcompose.services.clusterd import Clusterd +from materialize.mzcompose.services.kafka import Kafka from materialize.mzcompose.services.materialized import Materialized from materialize.mzcompose.services.minio import Minio from materialize.mzcompose.services.postgres import PostgresMetadata -from materialize.mzcompose.services.redpanda import Redpanda +from materialize.mzcompose.services.schema_registry import SchemaRegistry +from materialize.mzcompose.services.zookeeper import Zookeeper class Workload(Service): @@ -38,14 +44,22 @@ def __init__(self) -> None: "mzbuild": "antithesis-workload", "depends_on": { "materialized": {"condition": "service_healthy"}, - "redpanda": {"condition": "service_healthy"}, + "clusterd1": {"condition": "service_started"}, + "kafka": {"condition": "service_healthy"}, + "schema-registry": {"condition": "service_started"}, }, "environment": [ "PGHOST=materialized", 
"PGPORT=6875", "PGUSER=materialize", + # Internal SQL port for system-privileged setup (CREATE CLUSTER). + "PGPORT_INTERNAL=6877", + "PGUSER_INTERNAL=mz_system", "KAFKA_BROKER=kafka:9092", "SCHEMA_REGISTRY_URL=http://schema-registry:8081", + # Name of the unmanaged cluster the workload-entrypoint + # provisions against clusterd1 before emitting setup-complete. + "MZ_ANTITHESIS_CLUSTER=antithesis_cluster", ], } super().__init__(name="workload", config=config) @@ -54,7 +68,10 @@ def __init__(self) -> None: SERVICES = [ PostgresMetadata(), Minio(setup_materialize=True), - Redpanda(auto_create_topics=True), + Zookeeper(), + Kafka(auto_create_topics=True), + SchemaRegistry(), + Clusterd(name="clusterd1"), Materialized( external_blob_store=True, external_metadata_store=True, @@ -62,6 +79,12 @@ def __init__(self) -> None: unsafe_mode=True, soft_assertions=True, sanity_restart=False, + support_external_clusterd=True, + # Allow creating an unmanaged cluster pointed at clusterd1 — without + # this, CREATE CLUSTER ... STORAGECTL ADDRESSES is rejected. + additional_system_parameter_defaults={ + "unsafe_enable_unorchestrated_cluster_replicas": "true", + }, ), Workload(), ] @@ -69,6 +92,13 @@ def __init__(self) -> None: def workflow_default(c: Composition) -> None: """Bring up the Antithesis test cluster.""" - c.up("postgres-metadata", "minio", "redpanda") + c.up( + "postgres-metadata", + "minio", + "zookeeper", + "kafka", + "schema-registry", + "clusterd1", + ) c.up("materialized") c.up("workload") diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index f37eb275ab1e7..e660a7904bb46 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -11,13 +11,44 @@ set -euo pipefail -# Wait for materialized to be ready before signaling setup_complete. 
+PGHOST="${PGHOST:-materialized}" +PGPORT="${PGPORT:-6875}" +PGUSER="${PGUSER:-materialize}" +PGPORT_INTERNAL="${PGPORT_INTERNAL:-6877}" +PGUSER_INTERNAL="${PGUSER_INTERNAL:-mz_system}" +CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}" + +# Wait for materialized to be ready. echo "Waiting for materialized to become healthy..." until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do sleep 1 done echo "materialized is healthy." +# Provision an unmanaged cluster backed by the external clusterd1 process. +# This must run before setup-complete so Test Composer assertions can target +# the cluster from the start. Idempotent — `IF NOT EXISTS` is unsupported on +# `CREATE CLUSTER REPLICAS (...)`, so we query mz_clusters first. +existing=$( + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \ + "SELECT 1 FROM mz_clusters WHERE name = '$CLUSTER'" +) +if [[ -z "$existing" ]]; then + echo "Provisioning cluster '$CLUSTER' against clusterd1..." + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" < Date: Mon, 11 May 2026 14:52:58 -0400 Subject: [PATCH 13/65] ci: regenerate antithesis compose YAML before build (avoid stale fingerprints) --- ci/nightly/pipeline.template.yml | 7 ++++++- ci/test/build-antithesis.sh | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100755 ci/test/build-antithesis.sh diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 065fbe0488b6f..b3c3068e04970 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -67,7 +67,12 @@ steps: - id: build-x86_64-antithesis label: ":rust: Build x86_64 (Antithesis)" - command: bin/ci-builder run stable bin/pyactivate -m ci.test.build + # Regenerate the antithesis compose YAML before building so the + # `antithesis-config` image's fingerprint captures the same + # materialized fingerprint we're about to publish — otherwise + # Antithesis would try to pull a 
stale `materialized:mzbuild-…` + # whenever the committed YAML lagged behind source changes. + command: bin/ci-builder run stable ci/test/build-antithesis.sh inputs: - "*" depends_on: [] diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh new file mode 100755 index 0000000000000..0eb0788b89cc1 --- /dev/null +++ b/ci/test/build-antithesis.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# build-antithesis.sh — antithesis-flavored build entry point. +# +# Regenerates test/antithesis/config/docker-compose.yaml against the +# current source tree before invoking ci.test.build, so that the +# `antithesis-config` mzbuild image bakes in a compose YAML whose +# materialized/antithesis-workload fingerprints match the fingerprints +# this build is about to publish to GHCR. +# +# The committed YAML in test/antithesis/config/docker-compose.yaml is for +# human review (PR diffs); its fingerprints can drift on every materialized +# source change, and the staleness lint masks them by design. This script +# is what guarantees Antithesis sees a self-consistent compose. 
+ +set -euo pipefail + +: "${CI_ANTITHESIS:?build-antithesis.sh expects CI_ANTITHESIS=1}" + +echo "--- Regenerating test/antithesis/config/docker-compose.yaml" +bin/pyactivate test/antithesis/export-compose.py \ + > test/antithesis/config/docker-compose.yaml + +exec bin/pyactivate -m ci.test.build From 2cfa6a3054eb1af5bf40b6f0fc921ae3ba66be8f Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 14:56:16 -0400 Subject: [PATCH 14/65] test/antithesis: parameterize compose via .env (no more baked-in fingerprints) --- .gitignore | 2 + ci/test/build-antithesis.sh | 50 ++++++++---- .../checks/check-antithesis-compose.sh | 13 ++- test/antithesis/Makefile | 13 ++- test/antithesis/config/Dockerfile | 9 ++- test/antithesis/config/docker-compose.yaml | 6 +- test/antithesis/config/mzbuild.yml | 21 +++-- test/antithesis/export-compose.py | 59 +++++++------- test/antithesis/export-env.py | 81 +++++++++++++++++++ test/antithesis/push-antithesis.py | 79 ++++++++++++++++++ 10 files changed, 266 insertions(+), 67 deletions(-) create mode 100644 test/antithesis/export-env.py create mode 100755 test/antithesis/push-antithesis.py diff --git a/.gitignore b/.gitignore index 6eb7e16708d6f..58321fab14d4f 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,8 @@ mzdata mzbuild __pycache__ +# Antithesis compose env file — generated by build-antithesis.sh / make build. +/test/antithesis/config/.env .mypy_cache venv node_modules diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh index 0eb0788b89cc1..ef9d24d7c420c 100755 --- a/ci/test/build-antithesis.sh +++ b/ci/test/build-antithesis.sh @@ -9,25 +9,45 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. # -# build-antithesis.sh — antithesis-flavored build entry point. +# build-antithesis.sh — antithesis-flavored build + Antithesis-registry push. 
# -# Regenerates test/antithesis/config/docker-compose.yaml against the -# current source tree before invoking ci.test.build, so that the -# `antithesis-config` mzbuild image bakes in a compose YAML whose -# materialized/antithesis-workload fingerprints match the fingerprints -# this build is about to publish to GHCR. -# -# The committed YAML in test/antithesis/config/docker-compose.yaml is for -# human review (PR diffs); its fingerprints can drift on every materialized -# source change, and the staleness lint masks them by design. This script -# is what guarantees Antithesis sees a self-consistent compose. +# 1. Write `.env` so `antithesis-config` bakes in compose refs that point +# at the Antithesis GCP Artifact Registry (where we'll mirror to). The +# .env content is one of antithesis-config's mzbuild inputs, so the +# image fingerprint tracks the source it references — self-consistent. +# 2. Run the standard `ci.test.build` to compile antithesis-flavored Rust +# binaries and build the docker images (pushed to GHCR via mzbuild). +# 3. `docker login` the GCP Artifact Registry using +# `GCP_SERVICE_ACCOUNT_JSON` (already forwarded into ci-builder). +# 4. Retag + push `materialized`, `antithesis-workload`, and +# `antithesis-config` to the Antithesis registry. Public images +# referenced by the compose (postgres, minio, kafka stack) stay on +# their upstream registries — Antithesis can reach those directly. set -euo pipefail : "${CI_ANTITHESIS:?build-antithesis.sh expects CI_ANTITHESIS=1}" -echo "--- Regenerating test/antithesis/config/docker-compose.yaml" -bin/pyactivate test/antithesis/export-compose.py \ - > test/antithesis/config/docker-compose.yaml +# GCP Artifact Registry path for Antithesis. Tags pushed under +# $ANTITHESIS_REGISTRY/<name>:mzbuild-<fingerprint>. 
+ANTITHESIS_REGISTRY="${ANTITHESIS_REGISTRY:-us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository}" + +echo "--- Writing test/antithesis/config/.env (registry: $ANTITHESIS_REGISTRY)" +bin/pyactivate test/antithesis/export-env.py \ + --registry "$ANTITHESIS_REGISTRY" \ + > test/antithesis/config/.env + +echo "--- Building antithesis-flavored mzbuild images" +bin/pyactivate -m ci.test.build + +echo "--- Authenticating to Antithesis registry" +if [[ -z "${GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then + echo "GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2 + echo "Provision it as a Buildkite-agent env var (see bin/ci-builder env-forwarding)." >&2 + exit 1 +fi +echo "$GCP_SERVICE_ACCOUNT_JSON" \ + | docker login -u _json_key --password-stdin "https://${ANTITHESIS_REGISTRY%%/*}" -exec bin/pyactivate -m ci.test.build +echo "--- Pushing Materialize-built images to the Antithesis registry" +bin/pyactivate test/antithesis/push-antithesis.py --registry "$ANTITHESIS_REGISTRY" diff --git a/ci/test/lint-main/checks/check-antithesis-compose.sh b/ci/test/lint-main/checks/check-antithesis-compose.sh index add2f2a0dab57..55c54f0bccfba 100755 --- a/ci/test/lint-main/checks/check-antithesis-compose.sh +++ b/ci/test/lint-main/checks/check-antithesis-compose.sh @@ -12,9 +12,10 @@ # check-antithesis-compose.sh — ensure test/antithesis/config/docker-compose.yaml # is in sync with test/antithesis/mzcompose.py. # -# Fingerprint refs (`mzbuild-`) shift on every materialized code change, -# so we mask them before diffing — we only want to catch composition -# (services, ports, env, deps) drift, not transient fingerprint churn. +# Image refs in the committed YAML are `${MATERIALIZED_IMAGE}` style +# placeholders (resolved from `.env` at compose-parse time), so the file is +# stable across materialized source changes. A plain diff catches any +# composition (services/ports/env/deps) drift. 
set -euo pipefail @@ -29,11 +30,7 @@ check_antithesis_compose() { bin/pyactivate test/antithesis/export-compose.py > "$generated" - # Mask `mzbuild-` so the diff is structural-only. - local mask='s/(mzbuild-)[A-Z0-9]+/\1FINGERPRINT/g' - if ! diff -u \ - <(sed -E "$mask" "$committed") \ - <(sed -E "$mask" "$generated"); then + if ! diff -u "$committed" "$generated"; then echo echo "$committed is out of sync with test/antithesis/mzcompose.py." echo "Regenerate with:" diff --git a/test/antithesis/Makefile b/test/antithesis/Makefile index f25077b582d33..878bf7e384019 100644 --- a/test/antithesis/Makefile +++ b/test/antithesis/Makefile @@ -41,24 +41,29 @@ ifeq ($(RUNTIME),podman) endif COMPOSE_FILE := $(REPO_ROOT)/test/antithesis/config/docker-compose.yaml -COMPOSE := $(RUNTIME) compose -p $(PROJECT) -f $(COMPOSE_FILE) +ENV_FILE := $(REPO_ROOT)/test/antithesis/config/.env +COMPOSE := $(RUNTIME) compose -p $(PROJECT) --env-file $(ENV_FILE) -f $(COMPOSE_FILE) PSQL := $(COMPOSE) exec materialized psql -h localhost -p 6875 -U materialize # mzbuild images we need built locally. Third-party images (postgres, minio, -# redpanda) are pulled by `docker compose` from their upstream registries. +# kafka, …) are pulled by `docker compose` from their upstream registries. 
MZBUILD_IMAGES := materialized antithesis-workload # --------------------------------------------------------------------------- # Build # --------------------------------------------------------------------------- -.PHONY: build export-compose acquire-images +.PHONY: build export-compose export-env acquire-images -build: export-compose acquire-images +build: export-compose export-env acquire-images export-compose: cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-compose.py > $(COMPOSE_FILE) @echo "Wrote $(COMPOSE_FILE)" +export-env: + cd $(REPO_ROOT) && bin/pyactivate test/antithesis/export-env.py > $(ENV_FILE) + @echo "Wrote $(ENV_FILE)" + acquire-images: @for image in $(MZBUILD_IMAGES); do \ echo "--- Acquiring $$image (--antithesis)"; \ diff --git a/test/antithesis/config/Dockerfile b/test/antithesis/config/Dockerfile index 386049db7e8e5..32fcb07e30460 100644 --- a/test/antithesis/config/Dockerfile +++ b/test/antithesis/config/Dockerfile @@ -7,9 +7,12 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -# Antithesis config image: a FROM-scratch tarball holding the resolved +# Antithesis config image: a FROM-scratch tarball holding the # docker-compose.yaml that Antithesis uses to bring up the system under -# test. See mzbuild.yml for regeneration instructions. +# test, plus a `.env` mapping `${MATERIALIZED_IMAGE}` / +# `${ANTITHESIS_WORKLOAD_IMAGE}` to current mzbuild fingerprints. Compose +# loads `.env` automatically at parse time. See mzbuild.yml for +# regeneration instructions. 
FROM scratch -COPY docker-compose.yaml / +COPY docker-compose.yaml .env / diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 26819190cd164..73291200a043c 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -209,7 +209,7 @@ services: restart: 'no' stop_grace_period: 120s platform: linux/amd64 - image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7 + image: ${MATERIALIZED_IMAGE} materialized: hostname: materialized depends_on: @@ -397,7 +397,7 @@ services: start_period: 600s stop_grace_period: 120s platform: linux/amd64 - image: ghcr.io/materializeinc/materialize/materialized:mzbuild-IIJDLZ77L7R7ZOGPATZYUWRESDF5LDN7 + image: ${MATERIALIZED_IMAGE} workload: depends_on: materialized: @@ -418,7 +418,7 @@ services: - SCHEMA_REGISTRY_URL=http://schema-registry:8081 - MZ_ANTITHESIS_CLUSTER=antithesis_cluster platform: linux/amd64 - image: ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-XX2UEHO746TTSXP3JUOIMJTYD2WWEBLY + image: ${ANTITHESIS_WORKLOAD_IMAGE} networks: {} volumes: mzdata: null diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml index 899d620d1285f..07011b460f407 100644 --- a/test/antithesis/config/mzbuild.yml +++ b/test/antithesis/config/mzbuild.yml @@ -7,13 +7,20 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -# FROM-scratch image holding the resolved docker-compose.yaml for the -# Antithesis environment. Antithesis pulls this image and reads the compose -# spec from `/docker-compose.yaml` to bring up the system under test. +# FROM-scratch image holding the docker-compose.yaml + .env for the +# Antithesis environment. 
Antithesis pulls this image and reads +# `/docker-compose.yaml` to bring up the system under test; `.env` supplies +# `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}` at compose-parse +# time. # -# The compose file is generated from test/antithesis/mzcompose.py via -# `bin/pyactivate test/antithesis/export-compose.py`. Re-run that whenever -# the composition topology changes; CI verifies the committed copy is up to -# date. +# The compose YAML (committed, topology-only) is generated from +# `test/antithesis/mzcompose.py` via `bin/pyactivate +# test/antithesis/export-compose.py`. Regenerate when topology changes; CI +# verifies the committed copy is up to date. +# +# `.env` (generated, gitignored) is written by +# `bin/pyactivate test/antithesis/export-env.py` at build time. Its content +# changes every materialized fingerprint shift, which is what propagates +# fresh fingerprints into this image without touching the committed YAML. name: antithesis-config diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 081ce78ed41db..dcab7c16a2866 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -11,20 +11,24 @@ """Export the resolved docker-compose YAML for the Antithesis composition. -Loads `test/antithesis/mzcompose.py`, resolves every `mzbuild:` reference, -and dumps the resulting docker-compose dict to stdout. Antithesis pulls the -referenced images directly from public GHCR — no separate registry, no -re-tagging. +Loads `test/antithesis/mzcompose.py` and dumps a docker-compose YAML to +stdout where Materialize-built images are emitted as compose env-var +placeholders (`${MATERIALIZED_IMAGE}`, `${ANTITHESIS_WORKLOAD_IMAGE}`). +The actual fingerprint values are supplied separately in a `.env` file +generated by `export-env.py`. This separation lets the committed YAML stay +stable across materialized source changes — only `.env` shifts per +fingerprint. 
Image-reference policy: - * Materialize-built images (`materialized`, `antithesis-workload`) are - emitted as `ghcr.io/materializeinc/materialize/:mzbuild-`. - The fingerprint participates in `antithesis=True` so antithesis builds - don't collide with regular builds. + * Materialize-built images (`materialized`, `antithesis-workload`) + become `${MATERIALIZED_IMAGE}` / `${ANTITHESIS_WORKLOAD_IMAGE}`. + Compose interpolates them from `.env` at parse time. The actual specs + are `ghcr.io/materializeinc/materialize/:mzbuild-` with + `antithesis=True` participating in the fingerprint. - * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with the - public upstream image. Our mzbuild variants bake in test-friendly + * Third-party `mzbuild` images (`postgres`, `minio`) are replaced with + the public upstream image. Our mzbuild variants bake in test-friendly patches (eatmydata, no_fsync) that defeat Antithesis's fault injection; Antithesis runs against vanilla. @@ -48,12 +52,18 @@ from materialize.mzbuild import Repository from materialize.mzcompose.composition import Composition -# mzbuild image names that we publish to GHCR and want Antithesis to pull -# under our fingerprint. Everything else falls back to a public image. -MATERIALIZE_IMAGES = {"materialized", "antithesis-workload"} +# mzbuild image names that we publish under our fingerprint. Each maps to +# the compose env-var placeholder; `.env` (export-env.py) supplies the +# concrete ref at compose-parse time. Keep in sync with `export-env.py`. +MATERIALIZE_IMAGES = { + "materialized": "${MATERIALIZED_IMAGE}", + "antithesis-workload": "${ANTITHESIS_WORKLOAD_IMAGE}", +} # Public-image fallbacks for mzbuild images whose Materialize-specific # customizations subvert Antithesis (eatmydata, fsync no-ops, etc.). +# Antithesis can reach public registries — we just need to make sure the +# compose points at the upstream image, not our patched mzbuild build. 
PUBLIC_FALLBACKS = { "postgres": "postgres:17.7", "minio": "minio/minio:latest", @@ -78,18 +88,18 @@ """ -def resolve_mzbuild(svc: dict[str, Any], deps: Any) -> None: - """Replace `mzbuild:` with a concrete `image:` ref.""" +def resolve_mzbuild(svc: dict[str, Any]) -> None: + """Replace `mzbuild:` with a concrete or templated `image:` ref.""" name = svc.pop("mzbuild") if name in MATERIALIZE_IMAGES: - svc["image"] = deps[name].spec() + svc["image"] = MATERIALIZE_IMAGES[name] elif name in PUBLIC_FALLBACKS: svc["image"] = PUBLIC_FALLBACKS[name] else: raise ValueError( f"mzbuild image {name!r} has no Antithesis policy — add it to " - f"MATERIALIZE_IMAGES (use our GHCR build) or PUBLIC_FALLBACKS " - f"(swap to a public image) in export-compose.py." + f"MATERIALIZE_IMAGES (use a `.env` placeholder) or " + f"PUBLIC_FALLBACKS (swap to a public image) in export-compose.py." ) @@ -181,21 +191,16 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None: def main() -> None: # munge_services=False keeps ports bare (e.g., `6875` instead of # `127.0.0.1::6875`) — Antithesis is container-to-container, no host - # binding. We do our own mzbuild→image substitution below. + # binding. We do our own mzbuild→image substitution below and don't + # need fingerprint resolution since Materialize-built images become + # `${...}` placeholders. 
repo = Repository(Path("."), arch="x86_64", antithesis=True) c = Composition(repo, "antithesis", munge_services=False) - images = [ - repo.images[svc["mzbuild"]] - for svc in c.compose["services"].values() - if "mzbuild" in svc - ] - deps = repo.resolve_dependencies(images) - for svc in c.compose["services"].values(): svc["platform"] = "linux/amd64" if "mzbuild" in svc: - resolve_mzbuild(svc, deps) + resolve_mzbuild(svc) inline_postgres_setup(svc) strip_host_bindmounts(svc) strip_incompatible_env(svc) diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py new file mode 100644 index 0000000000000..c1ea463d80e2e --- /dev/null +++ b/test/antithesis/export-env.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Emit the `.env` file consumed by Antithesis's docker-compose.yaml. + +The compose YAML (export-compose.py) is committed with `${MATERIALIZED_IMAGE}` +/ `${ANTITHESIS_WORKLOAD_IMAGE}` placeholders so it stays stable across +materialized source changes. This script writes the corresponding `.env` +with the current mzbuild fingerprints so compose can interpolate them. + +Run at CI build time (build-antithesis.sh) and at local-dev `make build`. +The `antithesis-config` mzbuild image copies in the .env produced by this +script, so the image's own fingerprint tracks the materialized fingerprint +transitively — same materialized → same .env → same antithesis-config. + +With `--registry`, the emitted refs use that registry prefix instead of +the default (whatever `spec()` returns based on `MZ_GHCR`). 
CI passes the +Antithesis GCP Artifact Registry so the compose Antithesis pulls +references images at the registry Antithesis can actually reach. + +Usage: + bin/pyactivate test/antithesis/export-env.py \\ + > test/antithesis/config/.env + bin/pyactivate test/antithesis/export-env.py \\ + --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository \\ + > test/antithesis/config/.env +""" + +import argparse +import sys +from pathlib import Path + +from materialize.mzbuild import Repository + +# Mapping of `.env` variable name → mzbuild image name. Keep in sync with +# MATERIALIZE_IMAGES in export-compose.py. +ENV_VARS = { + "MATERIALIZED_IMAGE": "materialized", + "ANTITHESIS_WORKLOAD_IMAGE": "antithesis-workload", +} + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--registry", + default=None, + help=( + "Registry prefix to use for emitted refs. If unset, uses the " + "default `spec()` (GHCR when MZ_GHCR=1, else Docker Hub)." + ), + ) + args = parser.parse_args() + + repo = Repository(Path("."), arch="x86_64", antithesis=True) + images = [repo.images[name] for name in ENV_VARS.values()] + deps = repo.resolve_dependencies(images) + + sys.stdout.write( + "# GENERATED FILE — do not edit. 
Regenerate via:\n" + "# bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env\n" + "# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time.\n" + ) + for var, image_name in ENV_VARS.items(): + if args.registry: + ref = f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}" + else: + ref = deps[image_name].spec() + sys.stdout.write(f"{var}={ref}\n") + + +if __name__ == "__main__": + main() diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py new file mode 100755 index 0000000000000..83346214ac841 --- /dev/null +++ b/test/antithesis/push-antithesis.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Retag + push antithesis-flavored images to Antithesis's GCP registry. + +Antithesis's sandbox pulls images by reference. Our standard mzbuild flow +publishes to GHCR with `mzbuild-` tags, but new GHCR packages default +to private visibility — Antithesis hits a 4001 (image-not-reachable) when +trying to pull them. Pushing to a GCP Artifact Registry whose IAM grants +Antithesis read access avoids the visibility dance entirely. + +This script presumes `ci.test.build` has already run (so the source images +exist locally) and that `docker login` against the target registry has +already happened (build-antithesis.sh handles that via +GCP_SERVICE_ACCOUNT_JSON). 
+ +Usage: + bin/pyactivate test/antithesis/push-antithesis.py \\ + --registry us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository +""" + +import argparse +from pathlib import Path + +from materialize import spawn, ui +from materialize.mzbuild import Repository + +# Images Antithesis needs to be able to pull: +# - antithesis-config holds the docker-compose.yaml + .env Antithesis runs. +# - materialized + antithesis-workload are referenced by that compose. +ANTITHESIS_IMAGES = ["materialized", "antithesis-workload", "antithesis-config"] + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--registry", + required=True, + help="Antithesis registry prefix, e.g. us-central1-docker.pkg.dev/molten-verve-216720/materialize-repository", + ) + args = parser.parse_args() + + # Match the Repository configuration used by ci.test.build so that + # `deps[name].spec()` returns the same local tag that build actually + # produced (materialize/:mzbuild-, not the GHCR-prefixed one). + repo = Repository( + Path("."), + arch="x86_64", + antithesis=True, + image_registry="materialize", + ) + deps = repo.resolve_dependencies([repo.images[name] for name in ANTITHESIS_IMAGES]) + + # Ensure each image is actually present locally before retag — ci.test.build's + # `ensure()` path may short-circuit to "already pushed" without leaving a + # local copy if the fingerprint was already in the cache. 
+ deps.acquire() + + for name in ANTITHESIS_IMAGES: + resolved = deps[name] + source = resolved.spec() + target = f"{args.registry}/{name}:mzbuild-{resolved.fingerprint()}" + ui.section(f"Pushing {name}") + print(f" source: {source}") + print(f" target: {target}") + spawn.runv(["docker", "tag", source, target]) + spawn.runv(["docker", "push", target]) + + +if __name__ == "__main__": + main() From d4373eb8bdb3d9bd615bf36bca6ed4a9c553031f Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 15:20:37 -0400 Subject: [PATCH 15/65] ci: distinct ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON for Antithesis registry push --- bin/ci-builder | 1 + ci/test/build-antithesis.sh | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/ci-builder b/bin/ci-builder index 0e81c806063d8..6d53be5cad2f5 100755 --- a/bin/ci-builder +++ b/bin/ci-builder @@ -281,6 +281,7 @@ case "$cmd" in --env AZURE_SERVICE_ACCOUNT_PASSWORD --env AZURE_SERVICE_ACCOUNT_TENANT --env GCP_SERVICE_ACCOUNT_JSON + --env ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON --env GITHUB_TOKEN --env GITHUB_GHCR_TOKEN --env GPG_KEY diff --git a/ci/test/build-antithesis.sh b/ci/test/build-antithesis.sh index ef9d24d7c420c..23d9480ad8188 100755 --- a/ci/test/build-antithesis.sh +++ b/ci/test/build-antithesis.sh @@ -17,8 +17,11 @@ # image fingerprint tracks the source it references — self-consistent. # 2. Run the standard `ci.test.build` to compile antithesis-flavored Rust # binaries and build the docker images (pushed to GHCR via mzbuild). -# 3. `docker login` the GCP Artifact Registry using -# `GCP_SERVICE_ACCOUNT_JSON` (already forwarded into ci-builder). +# 3. `docker login` the Antithesis GCP Artifact Registry using +# `ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON` (a service account scoped to +# `materialize-storage@molten-verve-216720.iam.gserviceaccount.com` — +# kept distinct from `GCP_SERVICE_ACCOUNT_JSON` which is used elsewhere +# for unrelated GCP integrations). # 4. 
Retag + push `materialized`, `antithesis-workload`, and # `antithesis-config` to the Antithesis registry. Public images # referenced by the compose (postgres, minio, kafka stack) stay on @@ -41,12 +44,12 @@ echo "--- Building antithesis-flavored mzbuild images" bin/pyactivate -m ci.test.build echo "--- Authenticating to Antithesis registry" -if [[ -z "${GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then - echo "GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2 +if [[ -z "${ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON:-}" ]]; then + echo "ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON is unset — pushing to the Antithesis registry will fail." >&2 echo "Provision it as a Buildkite-agent env var (see bin/ci-builder env-forwarding)." >&2 exit 1 fi -echo "$GCP_SERVICE_ACCOUNT_JSON" \ +echo "$ANTITHESIS_GCP_SERVICE_ACCOUNT_JSON" \ | docker login -u _json_key --password-stdin "https://${ANTITHESIS_REGISTRY%%/*}" echo "--- Pushing Materialize-built images to the Antithesis registry" From 3278bda7f8757b2c326f93bd77f318683f31acaa Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 16:06:21 -0400 Subject: [PATCH 16/65] test/antithesis: mark antithesis-config publish:false + commit placeholder .env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mzbuild's _build_locked runs `git clean -ffdX <path>` before each build, which wipes any gitignored file in the build context — including the .env we generate. Two fixes: 1. publish:false on antithesis-config so the standard ci.test.build flow skips it entirely on regular nightly builds (where .env never exists). Only build-antithesis.sh / push-antithesis.py builds this image, and they write .env first. 2. Commit a placeholder .env so the file is tracked (survives git clean) and participates in mzbuild's fingerprint computation. build-antithesis.sh overwrites it with real registry refs before the build runs; fingerprint reflects the overwritten content per build. 
--- .gitignore | 2 -- test/antithesis/config/.env | 21 +++++++++++++++++++++ test/antithesis/config/mzbuild.yml | 7 +++++++ 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 test/antithesis/config/.env diff --git a/.gitignore b/.gitignore index 58321fab14d4f..6eb7e16708d6f 100644 --- a/.gitignore +++ b/.gitignore @@ -18,8 +18,6 @@ mzdata mzbuild __pycache__ -# Antithesis compose env file — generated by build-antithesis.sh / make build. -/test/antithesis/config/.env .mypy_cache venv node_modules diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env new file mode 100644 index 0000000000000..d4f160a98596f --- /dev/null +++ b/test/antithesis/config/.env @@ -0,0 +1,21 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Compose env-file for `test/antithesis/config/docker-compose.yaml`. +# Tracked by git only so that the file exists for mzbuild's input +# fingerprinting and survives `git clean -ffdX` between builds. The +# committed values are placeholders — `build-antithesis.sh` overwrites +# them in CI with refs to images pushed to Antithesis's GCP Artifact +# Registry, and `make export-env` does the same with local-dev refs. +# +# If you see these placeholder values on a running cluster, your build +# pipeline did not regenerate this file. 
Run: +# bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env +MATERIALIZED_IMAGE=placeholder-not-built +ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built diff --git a/test/antithesis/config/mzbuild.yml b/test/antithesis/config/mzbuild.yml index 07011b460f407..f3491f546dbb5 100644 --- a/test/antithesis/config/mzbuild.yml +++ b/test/antithesis/config/mzbuild.yml @@ -22,5 +22,12 @@ # `bin/pyactivate test/antithesis/export-env.py` at build time. Its content # changes every materialized fingerprint shift, which is what propagates # fresh fingerprints into this image without touching the committed YAML. +# +# `publish: false` keeps the standard `ci.test.build` flow from trying to +# build this image — the committed `.env` holds only placeholder refs, and +# only `build-antithesis.sh` overwrites it with real ones before building. +# The antithesis nightly step builds and pushes the image directly via +# push-antithesis.py. name: antithesis-config +publish: false From 007c7af9d9970fb2030c7212368b232e0fbc363e Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 16:12:25 -0400 Subject: [PATCH 17/65] test/antithesis: pass Arch enum to Repository, not string --- test/antithesis/export-compose.py | 3 ++- test/antithesis/export-env.py | 3 ++- test/antithesis/push-antithesis.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index dcab7c16a2866..4e1fb5bece519 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -51,6 +51,7 @@ from materialize import MZ_ROOT from materialize.mzbuild import Repository from materialize.mzcompose.composition import Composition +from materialize.xcompile import Arch # mzbuild image names that we publish under our fingerprint. Each maps to # the compose env-var placeholder; `.env` (export-env.py) supplies the @@ -194,7 +195,7 @@ def main() -> None: # binding. 
We do our own mzbuild→image substitution below and don't # need fingerprint resolution since Materialize-built images become # `${...}` placeholders. - repo = Repository(Path("."), arch="x86_64", antithesis=True) + repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True) c = Composition(repo, "antithesis", munge_services=False) for svc in c.compose["services"].values(): diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py index c1ea463d80e2e..043c912cf1608 100644 --- a/test/antithesis/export-env.py +++ b/test/antithesis/export-env.py @@ -39,6 +39,7 @@ from pathlib import Path from materialize.mzbuild import Repository +from materialize.xcompile import Arch # Mapping of `.env` variable name → mzbuild image name. Keep in sync with # MATERIALIZE_IMAGES in export-compose.py. @@ -60,7 +61,7 @@ def main() -> None: ) args = parser.parse_args() - repo = Repository(Path("."), arch="x86_64", antithesis=True) + repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True) images = [repo.images[name] for name in ENV_VARS.values()] deps = repo.resolve_dependencies(images) diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py index 83346214ac841..2787f5cee8f30 100755 --- a/test/antithesis/push-antithesis.py +++ b/test/antithesis/push-antithesis.py @@ -32,6 +32,7 @@ from materialize import spawn, ui from materialize.mzbuild import Repository +from materialize.xcompile import Arch # Images Antithesis needs to be able to pull: # - antithesis-config holds the docker-compose.yaml + .env Antithesis runs. @@ -53,7 +54,7 @@ def main() -> None: # produced (materialize/:mzbuild-, not the GHCR-prefixed one). 
repo = Repository( Path("."), - arch="x86_64", + arch=Arch.X86_64, antithesis=True, image_registry="materialize", ) From 8e459cdf56d46466fdcf0ba435ff40324e047c1a Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 16:38:44 -0400 Subject: [PATCH 18/65] test/antithesis: kafka source property catalog + first workload property Add 16 Antithesis properties for Kafka source ingestion (NONE + UPSERT envelopes) to the scratchbook, plus the workload-side implementation of upsert-key-reflects-latest-value. Scratchbook additions: - sut-analysis Appendix A: kafka source pipeline detail - existing-assertions: enumerated SUT-side panic/assert sites that are candidates for Antithesis SDK instrumentation - property-catalog Category 7: 16 new Kafka/UPSERT properties - property-relationships clusters 7-10 plus cross-cluster connections - 16 per-property evidence files - evaluation/synthesis.md: four-lens review Workload: - parallel_driver_upsert_latest_value.py: produces upserts+tombstones with deterministic randomness, requests a quiet period, polls mz_source_statistics for catchup, and asserts per-key value match (two always() assertions + one sometimes() liveness anchor). - helper_pg / helper_kafka / helper_quiet / helper_random / helper_source_stats / helper_upsert_source: shared utilities for subsequent Kafka source properties. 
--- test/antithesis/export-env.py | 4 +- .../scratchbook/evaluation/synthesis.md | 81 +++++++ .../scratchbook/existing-assertions.md | 51 +++++ .../kafka-source-frontier-monotonic.md | 40 ++++ .../kafka-source-no-data-duplication.md | 44 ++++ .../properties/kafka-source-no-data-loss.md | 42 ++++ .../kafka-source-no-internal-panic.md | 44 ++++ .../kafka-source-survives-broker-fault.md | 40 ++++ .../kafka-source-survives-clusterd-restart.md | 47 ++++ .../offset-known-not-below-committed.md | 39 ++++ .../reclock-mint-eventually-succeeds.md | 61 +++++ .../remap-shard-antichain-wellformed.md | 55 +++++ .../upsert-decode-error-retractable.md | 42 ++++ ...ert-ensure-decoded-called-before-access.md | 43 ++++ .../upsert-key-reflects-latest-value.md | 63 ++++++ .../properties/upsert-no-internal-panic.md | 43 ++++ .../upsert-state-consolidation-wellformed.md | 75 +++++++ .../upsert-state-rehydrates-correctly.md | 46 ++++ .../upsert-tombstone-removes-key.md | 38 ++++ .../scratchbook/property-catalog.md | 197 +++++++++++++++- .../scratchbook/property-relationships.md | 39 ++++ test/antithesis/scratchbook/sut-analysis.md | 81 +++++++ test/antithesis/workload/test/helper_kafka.py | 90 ++++++++ test/antithesis/workload/test/helper_pg.py | 120 ++++++++++ test/antithesis/workload/test/helper_quiet.py | 38 ++++ .../antithesis/workload/test/helper_random.py | 64 ++++++ .../workload/test/helper_source_stats.py | 86 +++++++ .../workload/test/helper_upsert_source.py | 54 +++++ .../parallel_driver_upsert_latest_value.py | 211 ++++++++++++++++++ 29 files changed, 1869 insertions(+), 9 deletions(-) create mode 100644 test/antithesis/scratchbook/evaluation/synthesis.md create mode 100644 test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md create mode 100644 test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md create mode 100644 test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md create mode 100644 
test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md create mode 100644 test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md create mode 100644 test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md create mode 100644 test/antithesis/scratchbook/properties/offset-known-not-below-committed.md create mode 100644 test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md create mode 100644 test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md create mode 100644 test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md create mode 100644 test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md create mode 100644 test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md create mode 100644 test/antithesis/scratchbook/properties/upsert-no-internal-panic.md create mode 100644 test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md create mode 100644 test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md create mode 100644 test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md create mode 100644 test/antithesis/workload/test/helper_kafka.py create mode 100644 test/antithesis/workload/test/helper_pg.py create mode 100644 test/antithesis/workload/test/helper_quiet.py create mode 100644 test/antithesis/workload/test/helper_random.py create mode 100644 test/antithesis/workload/test/helper_source_stats.py create mode 100644 test/antithesis/workload/test/helper_upsert_source.py create mode 100755 test/antithesis/workload/test/parallel_driver_upsert_latest_value.py diff --git a/test/antithesis/export-env.py b/test/antithesis/export-env.py index 043c912cf1608..5488a0f097673 100644 --- a/test/antithesis/export-env.py +++ b/test/antithesis/export-env.py @@ -72,7 +72,9 @@ def main() -> None: ) for var, image_name in ENV_VARS.items(): if args.registry: - ref = 
f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}" + ref = ( + f"{args.registry}/{image_name}:mzbuild-{deps[image_name].fingerprint()}" + ) else: ref = deps[image_name].spec() sys.stdout.write(f"{var}={ref}\n") diff --git a/test/antithesis/scratchbook/evaluation/synthesis.md b/test/antithesis/scratchbook/evaluation/synthesis.md new file mode 100644 index 0000000000000..fff919f61edf2 --- /dev/null +++ b/test/antithesis/scratchbook/evaluation/synthesis.md @@ -0,0 +1,81 @@ +# Property Catalog Evaluation — Kafka Source Additions + +**Scope**: The 16 properties added to Category 7 in `property-catalog.md` on 2026-05-11 targeting the Kafka source ingestion pipeline (NONE + UPSERT envelopes), and the assertion sites in `existing-assertions.md`. Pre-existing properties in Categories 1-6 are *not* re-evaluated here — they passed evaluation on 2026-05-06 and nothing has changed in their code paths. The 16 are: 5 user-visible Kafka source properties (`kafka-source-no-data-loss`, `-no-data-duplication`, `-frontier-monotonic`, `-survives-broker-fault`, `-survives-clusterd-restart`), 4 UPSERT envelope properties (`upsert-key-reflects-latest-value`, `-tombstone-removes-key`, `-state-rehydrates-correctly`, `-decode-error-retractable`), 3 UPSERT operator-internal properties (`upsert-no-internal-panic`, `-state-consolidation-wellformed`, `-ensure-decoded-called-before-access`), and 4 reclock / source-reader operator-internal properties (`kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed`). + +This evaluation was performed in single-agent mode across the four lenses, written as a single synthesis. Per-lens evidence files are inline below; spawning four parallel ensemble agents for a 16-property targeted addition would have been over-engineering given that one human's worth of catalog review is the better fit. 
+ +## Lens 1 — Antithesis Fit + +**Passes**: + +- All 16 properties target timing-sensitive, concurrency-sensitive, or partial-failure scenarios. None can be fully verified by a deterministic unit test. +- Mix of assertion types is healthy: 7 Safety (`Always`), 3 Liveness (`Sometimes`), 3 Reachability (`Unreachable`), 2 properties combine multiple assertion families internally. +- Several properties (`kafka-source-survives-clusterd-restart`, `upsert-state-rehydrates-correctly`, `reclock-mint-eventually-succeeds`) explicitly need fault injection that deterministic tests can't sequence — strong Antithesis fit. +- The SUT-side instrumentation properties (`upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access`, `kafka-source-no-internal-panic`) wrap *existing* asserts/panics rather than adding new logic; this is the cheapest possible instrumentation cost. + +**Refinements**: + +- `offset-known-not-below-committed` is borderline unit-test material — the invariant could be tested by mocking the statistics update path. Kept in the catalog because the *interesting* failure is the restart-window timing, which is genuinely Antithesis territory; lowered priority from P1 to P2 (already P2 in the catalog). +- `upsert-decode-error-retractable` could be tested as integration. It earns its catalog slot only if the test exercises crash recovery between the bad and good message; the evidence file already calls this out. No change needed. + +**Findings**: None. Antithesis fit is good across the addition. + +## Lens 2 — Coverage Balance + +**Passes**: + +- Both envelopes (NONE and UPSERT) get dedicated coverage. +- The SUT analysis's Appendix A failure-prone areas table has 9 rows; 8 of them are covered by at least one new property. The one uncovered row is "Flag flip mid-append on persist sink (commit 68e1dfd86d)" — see Gap below. +- Liveness, Safety, and Reachability are all represented. 
+- Both workload-observable and SUT-side properties exist; the workload-only properties form the user-visible contract (`kafka-source-no-data-loss`, etc.) and the SUT-side properties form the operator-internal correctness backbone. + +**Gaps identified** (addressed during this pass — see "Addressing findings" below): + +- **G1: Persist sink flag-flip TOCTOU** — commit `68e1dfd86d` (database-issues#9585) regression is not represented. The bug was a config flag re-evaluated multiple times during `append_batches`. Decision: **Acknowledged but not added**. This is a persist-sink generic correctness property, not Kafka-source-specific; it belongs in Category 1 (Persist Layer Safety), not in the Kafka section. Filing as a follow-up note in `property-relationships.md` would clutter the relationships; instead, called out here as a known omission for a future persist-focused research pass. + +- **G2: Partition reassignment correctness** — Kafka topic adding/removing partitions while the source is live is mentioned in the SUT analysis but not captured as a property. The closest is `kafka-source-no-internal-panic` which catches *panics* on the rebalance path but not *correctness* (no data loss, no duplicates, correct partition→worker assignment under rebalance). Decision: **Catalog as a future expansion item**, not added in this pass because it requires non-trivial workload support (the test driver must be able to dynamically add Kafka partitions, and the worker-hash assignment property requires multi-worker clusterd). + +- **G3: Schema Registry interaction** — Avro / Protobuf decoding via Schema Registry is a significant Kafka source code path that is unmentioned. Schema evolution mid-source is a known operational hazard. Decision: **Future expansion item**. The workload is realistically text/JSON for v1 of these properties; Schema Registry coverage is a v2 expansion. 
+ +**Refinements**: + +- The pre-existing `source-ingestion-progress` property is now redundant with `kafka-source-no-data-loss` for Kafka specifically. The relationships file calls this out. Decision: **Keep both** — `source-ingestion-progress` remains valid for non-Kafka sources (Postgres CDC, MySQL, generators), so it doesn't go away. The new property is more specific. No catalog edit needed beyond the cross-reference in `property-relationships.md`. + +## Lens 3 — Implementability + +**Passes**: + +- All workload-level properties can be checked via standard SQL queries against `mz_internal.mz_source_statistics_per_worker` and direct `SELECT` from the source. The workload only needs a PostgreSQL client and a Kafka producer (both already required by the existing topology in `deployment-topology.md`). +- All SUT-side properties wrap *existing* code (panic / assert / unreachable sites). No new SUT instrumentation logic is required, only replacing the existing macro with the Antithesis SDK equivalent and giving each callsite a unique message. +- Deployment topology already provides Kafka (Redpanda) and `materialized` in separate containers; network partition between them is a supported fault. +- Multi-replica scenarios for `upsert-state-consolidation-wellformed` and the upsert internals require a topology variation (multiple compute replicas serving the same source). The existing topology is single-replica; this is flagged. + +**Refinements**: + +- `kafka-source-survives-clusterd-restart` requires **node-termination faults**, which the `faults.md` reference says are disabled by default in Antithesis tenants. Flagged in the evidence file. The user should confirm this fault class is enabled. +- `upsert-state-consolidation-wellformed` (and `kafka-source-no-data-duplication` for the historical multi-replica regression) gain significant value from a multi-replica topology. 
Suggest adding a second topology variant to `deployment-topology.md` as a follow-up — single-replica is sufficient to start, but the multi-replica drain bug (commit `1accbe28b3`) requires multi-replica to reproduce. + +**Findings (refinements applied or noted in evidence files)**: + +- R1: Added a note to `properties/kafka-source-survives-clusterd-restart.md` calling out the node-termination-faults dependency. +- R2: Added a note to `properties/upsert-state-consolidation-wellformed.md` explaining the multi-replica relevance. + +## Lens 4 — Wildcard + +**Things the other lenses missed**: + +- **W1: Multi-topic / multi-source interaction.** The 16 properties all treat a single Kafka source as the unit of analysis. The real-world failure mode of "two Kafka sources on the same cluster, one is healthy, the other is partitioned" is unaddressed. The `materialized` container hosts both; partitioning one source from its broker should not affect the other. Decision: **Future expansion**. Adding this now would expand the workload significantly. + +- **W2: Clock-jump interaction with Kafka timestamps.** The SUT analysis flags `expect("kafka sources always have upstream_time")` at kafka.rs:1209 — this depends on the Kafka message timestamp being valid. Clock jumps on the *Kafka broker* could produce future or past message timestamps. The current property set doesn't address how Materialize handles a backward-clocked Kafka broker. Decision: **Acknowledged as a known gap**, similar to W1. + +- **W3: Reading the catalog as a whole, the SUT-side instrumentation properties feel like a single "wrap all the existing panics in Antithesis SDK" project rather than four separate properties.** Decision: **Keep the four-property structure** anyway, because the slugs give Antithesis distinct property tags and the per-site message uniqueness requirement makes them genuinely distinct invariants. But operationally, a single PR can implement all four. 
+ +## Addressing Findings + +- **Refinements applied**: R1, R2 (noted in evidence files during this pass). +- **Gaps held as known omissions**: G1 (persist-sink flag flip — belongs in Category 1), G2 (partition reassignment — needs workload extension), G3 (schema registry — v2 expansion), W1 (multi-source interaction), W2 (clock jumps on broker). +- **Biases escalated to user**: None — the catalog framing matches the user's stated scope ("basic properties for Kafka sources, both normal and upsert workloads"). The "basic" qualifier explicitly suggests that some areas like partition reassignment, schema registry, and multi-source scenarios are intentionally deferred to future passes. + +## Conclusion + +The 16-property Kafka source addition is implementable, well-scoped to Antithesis's strengths, and covers both envelopes plus the shared reclock layer. Known gaps are documented above as follow-up candidates. No biases escalated; the user's "basic" framing aligns with the catalog scope. diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md index 8e423c26a0415..592d71d368c15 100644 --- a/test/antithesis/scratchbook/existing-assertions.md +++ b/test/antithesis/scratchbook/existing-assertions.md @@ -35,3 +35,54 @@ Full Kubernetes topology: environmentd StatefulSet, postgres StatefulSet, redpan ## Implications for New Work All property assertions will need to be added fresh. The existing integration provides a starting point for topology but uses an older approach (experiment scripts, custom instrumented images). The new approach should leverage mzcompose for compose generation and add Antithesis SDK assertions either in the workload client or (for deeper coverage) in the Materialize Rust source. + +## Storage/Kafka/UPSERT Path — Candidate Instrumentation Sites + +Added 2026-05-11 during Kafka-source property discovery. 
These are existing `panic!`/`assert!`/`unreachable!` sites in the storage code that are direct candidates for being wrapped with the Antithesis SDK so that violations surface as reportable property failures rather than process aborts. Confirmed by grepping the source at commit `007c7af9d9970fb2030c7212368b232e0fbc363e`. + +### `src/storage/src/source/kafka.rs` + +- `:158` — `expect("positive pid")` +- `:265` — `expect("all source exports must be present in source resume uppers")` +- `:276` — `panic!("unexpected source export details: {:?}", details)` +- `:282` — `expect("statistics have been initialized")` +- `:345` — `expect("restored kafka offsets must fit into i64")` +- `:606, :853, :855, :891, :894, :897, :903, :907, :997` — various `expect()` and `assert!()` on reader state +- `:1142-1147` — `assert!(self.last_offsets[output_index].contains_key(&partition))` +- `:1193-1197` — `panic!("got negative offset (...) from otherwise non-error'd kafka message")` +- `:1209` — `expect("kafka sources always have upstream_time")` +- `:1457` — `assert!(…)` on payload structure + +### `src/storage/src/source/reclock.rs` and `reclock/compat.rs` + +- `reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))` +- `reclock.rs:321` — `assert!(prev < RB::before(pid))` +- `reclock/compat.rs:144` — `assert!(…)` on persist handle state +- `reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")` + +### `src/storage/src/upsert.rs` + +- `:541` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:636` — `panic!("key missing from commands_state")` +- `:1031` — `unreachable!("pending future never returns")` + +### `src/storage/src/upsert_continual_feedback.rs` + +- `:626` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:800` — `panic!("key missing from commands_state")` + +### `src/storage/src/upsert_continual_feedback_v2.rs` + +- `:315` — `assert!(diff.is_positive(), "invalid upsert input")` +- `:483` — `unreachable!()` on `(None, None)` 
from joined prior/new state + +### `src/storage/src/upsert/types.rs` — `StateValue` and `ensure_decoded` + +- `:297, :369, :403, :416, :430, :440` — six `panic!("called \`\` without calling \`ensure_decoded\`")` sites (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`) +- `:580` — `panic!("\`merge_update_state\` called with non-consolidating state")` +- `:621` — `assert_eq!(checksum_sum.0, seahash::hash(value) as i64, …)` inside `ensure_decoded` (diff_sum == 1) +- `:632, :637, :642` — three checks for `diff_sum == 0` (`len_sum`, `checksum_sum`, all-zero `value_xor`) +- `:672` — `panic!("invalid upsert state: non 0/1 diff_sum: …")` +- `:1062` — `panic!("attempted completion of already completed upsert snapshot")` + +Per the property catalog, each of these gets a *distinct, specific* Antithesis assertion message so a fired assertion names exactly the site reached. No site shares a message with another. See `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, `properties/upsert-ensure-decoded-called-before-access.md`, and `properties/kafka-source-no-internal-panic.md` for the per-site rename table. diff --git a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md new file mode 100644 index 0000000000000..03f551e5cbd9f --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md @@ -0,0 +1,40 @@ +# kafka-source-frontier-monotonic + +## Summary + +The `upper` frontier of the source's data persist shard never regresses across the source's lifetime, including across clusterd restarts and `compare_and_append` retries. + +## Code paths + +- `src/storage/src/render/persist_sink.rs` — `append_batches` calls `WriteHandle::compare_and_append`. 
Cached upper is the failure-prone spot (commit `505dc96aaa`: cached upper went stale under concurrent writers; fix uses `fetch_recent_upper`). +- `src/storage/src/source/reclock.rs` — `ReclockOperator::sync`: must not let the operator's `upper` field regress across `compare_and_append` retries. +- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")`: this is the assertion that catches genuinely invalid persist calls (vs. legitimate `UpperMismatch` which is retried). + +## How to check it + +- Workload polls `mz_internal.mz_source_statistics_per_worker.offset_committed` (or equivalent shard upper view) on a tight cadence and evaluates `assert_always!(upper_monotonic, "kafka: source shard upper non-monotonic")` on every sample, where `upper_monotonic` is false whenever a new sample is `< previous sample`. +- SUT-side: in `append_batches`, immediately before `compare_and_append`, capture the previous upper from the local cached state and `assert_always!(new_upper >= prev_upper, "persist sink: upper regression on append")`. Use distinct messages for the equivalent assertions on the reclock side. + +## What goes wrong on violation + +Downstream operators panic when `as_of > upper` (the reclock-`as_of` race in commit `e3805ad790`, database-issues#8698, was exactly this shape). `AS OF` SQL queries return wrong results. + +## Antithesis angle + +- Kill clusterd mid-`compare_and_append`. On restart, the cached upper must be refreshed before the next append. +- Concurrent reclock writers (two storage workers racing during a transient split-brain): both attempt CaS; only one wins; the other's local upper must catch up before it tries again. +- Inject persist consensus latency to widen the cache-staleness window. + +## Open question (resolved) + +Q: Does the reclock retry loop in `ReclockOperator::mint` (reclock.rs:160-166) protect against this, or is the bug in code that doesn't go through `sync`? + +A: The retry loop does protect — but only if `sync()` is called *before* the local upper is used in subsequent code.
The historical bug (`e3805ad790`) was in the `as_of` computation path which ran *outside* `mint` and used a cached upper from the read handle. Workload-level monotonicity assertion is sufficient to catch both paths. + +## Existing instrumentation + +None. The persist-side `panic!("compare_and_append failed: …")` in `reclock/compat.rs:306` is informational, not a property. Wrap with `assert_unreachable!` for the genuinely-invalid case and add an `assert_always!` for the workload-observable monotonicity. + +## Provenance + +Surfaced by: Data Integrity, Distributed Coordination. Direct regression target for commits `e3805ad790` and `505dc96aaa`. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md new file mode 100644 index 0000000000000..fba0e8348808f --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md @@ -0,0 +1,44 @@ +# kafka-source-no-data-duplication + +## Summary + +After settling, the source contains no duplicates — at most one row per `(partition, offset)` for NONE-envelope and at most one row per key for UPSERT-envelope. + +## Why this property + +Duplication is the symmetric failure mode to `kafka-source-no-data-loss`. It is silent, propagates into every downstream aggregate, and historically arose in the upsert operator under multi-replica drain (commit `1accbe28b3`, database-issues#9160). It is the more dangerous of the two failure modes because it is harder to detect operationally — the workload sees "extra" rows that look plausible. + +## Code paths + +- `src/storage/src/source/kafka.rs:1158` — per-incarnation dedup against `last_offsets` (drops messages with offset `<= last_offset`). Per-incarnation only; does not survive restart. +- `src/storage/src/render/persist_sink.rs` — the persist sink is responsible for ensuring writes are idempotent across restarts. 
Compare-and-append with idempotency tokens on retry handles the indeterminate-error case (compare with `idempotent-write-under-indeterminate`). +- `src/storage/src/upsert_continual_feedback.rs` — `drain_staged_input`: the regression target for commit `1accbe28b3`. Single-replica clusters masked the bug because capabilities were always singletons; multi-replica drained the same staged input twice. +- `src/storage/src/upsert.rs:541`, `upsert_continual_feedback*.rs` — `assert!(diff.is_positive(), "invalid upsert input")`. Retractions on the input would be the canonical "duplicate retraction" symptom. + +## How to check it + +Workload-level: +- NONE envelope: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1, 2 HAVING COUNT(*) > 1` returns 0 rows. Assert with `assert_always!(no_dupes, "kafka source: no duplicate (partition, offset)")`. +- UPSERT envelope: `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns 0 rows. Same assertion shape with a unique message. + +These run on every check fire, ideally on a polling cadence, not just at end-of-test. + +SUT-side: convert the existing `assert!(diff.is_positive(), "invalid upsert input")` into `assert_always!(diff.is_positive(), "upsert: input diff positive")` so a duplicate retraction surfaces as a property failure rather than a process abort. Distinct messages at each of the three callsites. + +## What goes wrong on violation + +Aggregates over the source double-count. Joins fan out. Downstream MVs become wrong in ways that are hard to attribute to ingestion. + +## Antithesis angle + +- Crash storage worker between `write_batches` and `append_batches`. Restart and verify that no `(partition, offset)` appears twice in the resulting persist shard. +- For UPSERT: multi-replica cluster topology (the historical bug requires it). Run two replicas on the same source and observe the persisted output for duplicate retractions. +- Race the upsert feedback-driven snapshot replay against new input. 
+ +## Existing instrumentation + +The runtime `assert!` in upsert.rs already aborts on negative input diffs — it just doesn't surface as an Antithesis property. Wrapping each callsite with `assert_always!` (per-site unique message) gives Antithesis the signal it needs without changing semantics outside Antithesis (the underlying `assert!` already aborts on violation). + +## Provenance + +Surfaced by: Data Integrity, Concurrency, Failure Recovery. Direct regression target for database-issues#9160. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md new file mode 100644 index 0000000000000..2a451a32d4312 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md @@ -0,0 +1,42 @@ +# kafka-source-no-data-loss + +## Summary + +Every Kafka message produced by the workload is eventually visible in the source — either as a row (NONE envelope) or as the latest value for its key (UPSERT envelope). + +## Why this property + +This is the headline guarantee of a streaming database. The previous catalog entry `source-ingestion-progress` covered the generic "frontier advances" liveness signal; this property is the Kafka-specific, workload-checkable version that compares produced records against `SELECT` output. + +## Code paths + +- `src/storage/src/source/kafka.rs` — `render_reader`: the reader loop that drains `PartitionQueue`s, deduplicates against `last_offsets`, and emits `(SourceMessage, KafkaTimestamp, +1)` triples. +- `src/storage/src/source/source_reader_pipeline.rs` — `create_raw_source`: assembles reader, remap, reclock. +- `src/storage/src/source/reclock.rs` — `ReclockOperator::mint`: binds source timestamps to Materialize timestamps and persists the binding via `compare_and_append` on the remap shard. 
+- `src/storage/src/render/persist_sink.rs` — `mint_batch_descriptions` → `write_batches` → `append_batches`: the path that actually puts rows into the source's data persist shard. +- For UPSERT: `src/storage/src/upsert.rs` (`upsert_classic`) and the continual-feedback variants in `upsert_continual_feedback*.rs`. + +## How to check it + +Workload-level: +1. The workload tracks every `(topic, partition, offset, key, value)` it produces. +2. After produce settles, the workload calls `ANTITHESIS_STOP_FAULTS` and waits for `mz_internal.mz_source_statistics_per_worker` to report `offset_committed >= max_produced_offset`. +3. The workload asserts via `assert_sometimes!(expected_rowcount_visible, "kafka source caught up to produced offsets")` that `SELECT COUNT(*) FROM source` returns at least `produced_count` (NONE) or that the per-key latest-value model matches the source (UPSERT). + +SUT-side anchor: `assert_sometimes!(persist_sink_appended_batch)` inside `append_batches` after the first successful `compare_and_append` for this source. + +## What goes wrong on violation + +Silent data loss: the source ingests fewer rows than were produced; the workload sees a stall that doesn't resolve even with faults paused. Downstream MVs see incomplete data. + +## Antithesis angle + +The interesting window is mid-batch crash: a clusterd kill between the persist sink's `write_batches` (which uploads parts) and `append_batches` (which compare-and-appends). The resume frontier on restart determines what gets re-read. Bugs here look like: wrong resume offset (commit history: kafka.rs:1158 dedup is per-incarnation only — across restart, idempotency depends on persist-sink correctness). + +## Existing instrumentation + +None. No `assert_sometimes!` in the source path today (verified against `existing-assertions.md`). To implement: add an `assert_sometimes!` in the persist sink's `append_batches` after a successful append, plus a workload-side `assert_sometimes!` after the quiet-period catch-up check.
+ +## Provenance + +Surfaced by: Data Integrity, Failure Recovery, Product Context. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md new file mode 100644 index 0000000000000..6f6106aedbcce --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-no-internal-panic.md @@ -0,0 +1,44 @@ +# kafka-source-no-internal-panic + +## Summary + +The explicit panics and `assert!`s in the Kafka source reader never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged Antithesis assertion so a firing is a reportable property failure rather than a clusterd crash. + +## Targeted sites + +`src/storage/src/source/kafka.rs`: + +| Line | Site | Antithesis form | +|------|------|------------------| +| 276 | `panic!("unexpected source export details: {:?}", details)` | `assert_unreachable!("kafka: unexpected source export details")` | +| 891 | `assert!(reader.partition_consumers.is_empty())` | `assert_always!(reader.partition_consumers.is_empty(), "kafka: partition_consumers not drained at shutdown")` | +| 1142 | `assert!(self.last_offsets.get(output_index).unwrap().contains_key(&partition))` | `assert_always!(…, "kafka: partition missing from last_offsets")` | +| 1193 | `panic!("got negative offset ({}) from otherwise non-error'd kafka message", msg.offset())` | `assert_unreachable!("kafka: negative offset from non-error message")` | +| 1457 | `assert!(…)` (debug-mode payload validation) | `assert_always!(…, "kafka: payload check")` | + +Plus the cluster of `expect()` sites that are structurally similar — resume-upper missing (265), statistics not initialized (282), restored offset out of `i64` range (345), `position()` failure (606), `partition_known` lookup (853, 855), offset arithmetic (997, 1055, 1060, 1063, 1072, 1082), watermark not negative (1492). 
These are lower-priority but mass-conversion to `assert_always!(false, ...)` is cheap. + +## Why these sites matter + +- The "negative offset" panic at 1193 is the most interesting: rdkafka has shipped negative offsets in the past under certain protocol bugs, and an `i64` cast that wraps silently would be worse than the panic. Antithesis can reach this through manual broker-state manipulation in the workload. +- The capability-downgrade assertion family (relevant to commit `99ad668af5`'s topic-recreation panic) — currently that code path *logs and continues* rather than panicking, but if a future refactor reintroduces a `panic!` on offset regression, this property catches it. +- The `partition_consumers.is_empty()` assertion at 891 catches a shutdown-ordering bug that would manifest as a clusterd crash on source drop. + +## Antithesis angle + +- Topic deletion + recreation on the Kafka container. Specifically: drop a topic with offsets `[0..1000]`, recreate it with offsets `[0..100]` (lower watermark). The source's resume frontier sees `last_offset = 1000` and rdkafka delivers offset `100`. The dedup at kafka.rs:1158 handles this; the assertion at 1142 catches the case where the *partition itself* is missing from the dedup table. +- Partition rebalance: increase Kafka topic partition count from the broker side mid-run. The metadata fetcher must discover and assign the new partitions correctly. +- Manual offset reset: most relevant for the negative-offset panic at 1193. +- Clock jumps: Kafka's internal timestamp arithmetic uses millisecond offsets; clock jitter has historically interacted poorly with the `expect("kafka sources always have upstream_time")` at line 1209. + +## Existing instrumentation + +The panics and asserts already exist. They currently abort clusterd. The work is wrapping each site with the Antithesis SDK so the abort becomes a reportable, replayable property failure. Each site uses a distinct message naming exactly the invariant violated. 
+ +## Relationship to other properties + +This is the SUT-side counterpart to the workload-level `kafka-source-no-data-loss` and `kafka-source-no-data-duplication`. A workload-level row-count mismatch tells you data is wrong; a fired SUT-side assertion tells you *where* it went wrong. + +## Provenance + +Surfaced by: Failure Recovery, External Dependencies. Regression targets: commits `99ad668af5`, `3e32df1f69`. diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md new file mode 100644 index 0000000000000..fd05df6b47e70 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-survives-broker-fault.md @@ -0,0 +1,40 @@ +# kafka-source-survives-broker-fault + +## Summary + +After a network partition or Kafka outage that prevents the source from making progress, once connectivity is restored the source resumes ingestion and eventually ingests every message produced during the outage. + +## Code paths + +- `src/storage/src/source/kafka.rs` — `render_reader` polls per-partition `PartitionQueue`s. rdkafka's internal reconnect logic handles broker reconnect; the storage reader must not enter a permanent stall state when the consumer errors out. +- `src/storage/src/healthcheck.rs` — the source's `HealthStatusUpdate` transitions: `Running` → `Stalled { hint }` during the outage → back to `Running` after recovery. `Ceased` would be a violation (terminal failure for a transient fault). +- `src/storage/src/statistics.rs` — `offset_known` and `offset_committed` resume advancing post-recovery. The rehydration-latency reset (commit `0a34b6c79d`) is relevant if the reconnect goes through a dataflow restart. + +## How to check it + +Workload procedure: +1. Produce N messages. +2. Inject a network partition between the `materialized` container and the Kafka container. The partition isolates only that pair; persist/metadata remain reachable. +3.
Produce N more messages while the partition is active. +4. Heal the partition (Antithesis fault scheduler) and call `ANTITHESIS_STOP_FAULTS`. +5. Poll `mz_internal.mz_source_statistics_per_worker.offset_committed` until it advances past `max_produced_offset`. Bound the poll loop with a generous timeout. +6. `assert_sometimes!(source_resumed_after_broker_fault, "kafka source resumed after Kafka container partition")`. + +## What goes wrong on violation + +The source enters a permanent stall: rdkafka thinks it's reconnected but the reader never re-reads; or the operator transitions to `Ceased` and the source must be manually dropped/recreated. + +## Antithesis angle + +- Bidirectional network partition: `materialized` ↔ Kafka. +- Asymmetric partition: outbound packets to Kafka dropped but inbound responses allowed (or vice versa). rdkafka may not detect this and may sit waiting for a response forever. +- Repeated short partitions: stress reconnect cadence. +- Kafka container hang (CPU throttling to zero rather than network partition). + +## Existing instrumentation + +None. Workload-level `assert_sometimes!` is the entry point. Optional SUT-side: `assert_sometimes!(kafka_consumer_reconnected, ...)` inside the reader after rdkafka reports a successful reconnect. + +## Provenance + +Surfaced by: Failure Recovery, External Dependencies. diff --git a/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md new file mode 100644 index 0000000000000..072f374048ae6 --- /dev/null +++ b/test/antithesis/scratchbook/properties/kafka-source-survives-clusterd-restart.md @@ -0,0 +1,47 @@ +# kafka-source-survives-clusterd-restart + +## Summary + +After clusterd is killed and restarted, the Kafka source recovers its state, computes the correct resume offsets, and ingests messages produced before, during, and after the restart. 
+ +## Code paths + +- `src/storage-client/src/controller.rs` — the storage controller's command-replay logic; this is the entry point for the `storage-command-replay-idempotent` property cluster. +- `src/storage/src/storage_state.rs` — `RunIngestionCommand` handling. The async storage worker serializes ingestion vs. compaction (commit `3e5259782c`). +- `src/storage/src/source/source_reader_pipeline.rs:481-493` — remap operator bootstraps by loading the entire initial batch from the remap shard before resuming new mints. +- `src/storage/src/source/kafka.rs:346-349` — `start_offsets` derived from persisted resume frontier. +- For UPSERT: `src/storage/src/upsert.rs` and `upsert_continual_feedback*.rs` — state reconstruction via the feedback stream (drain all values at or below resume frontier, then transition to normal mint mode). + +## How to check it + +Workload procedure: +1. Produce N messages; wait for source to ingest them. +2. Kill clusterd via Antithesis node-termination fault. +3. Produce M more messages while clusterd is down. +4. Wait for restart, call `ANTITHESIS_STOP_FAULTS`. +5. Poll until `offset_committed >= max_produced_offset`. +6. `assert_sometimes!(clusterd_restart_recovered, "kafka source recovered after clusterd kill")`. Combine with `kafka-source-no-data-duplication` to rule out double-counting; combine with `kafka-source-no-data-loss` to rule out gaps. + +## What goes wrong on violation + +- Resume offset is wrong (too low → duplicates; too high → gap). +- UPSERT state is wrong (stale value per key, or missing keys). +- Source never recovers because remap-shard bootstrap fails. + +## Antithesis angle + +The most interesting timing is a kill *between* the persist sink's `compare_and_append` returning success and the controller's frontier-report channel actually delivering the new frontier upstream. The source on restart must compute its resume frontier from the durably-recorded shard upper, not from any cached or in-flight state. 
+ +For UPSERT specifically: kill during the snapshot phase. The feedback-driven snapshot must restart cleanly and complete with the same final state. + +## Dependency + +Requires **node-termination faults** to be enabled in the Antithesis tenant. Confirm with the user. Without this fault, the property is vacuous. + +## Existing instrumentation + +None. Workload-level assertion only, until SUT-side rehydration anchors are added. Candidate SUT anchors: `assert_sometimes!(snapshot_phase_completed, …)` in the upsert operator's snapshot-completion path, and `assert_sometimes!(remap_bootstrap_complete, …)` in `source_reader_pipeline.rs:481`. + +## Provenance + +Surfaced by: Failure Recovery. Builds on `storage-command-replay-idempotent` and `fault-recovery-exercised`. diff --git a/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md new file mode 100644 index 0000000000000..7b1d830ee91d6 --- /dev/null +++ b/test/antithesis/scratchbook/properties/offset-known-not-below-committed.md @@ -0,0 +1,39 @@ +# offset-known-not-below-committed + +## Summary + +For every Kafka source, the statistics view always reports `offset_known >= offset_committed`. Causally, what the broker has told us is available cannot lag what Materialize has durably ingested. + +## Code + +- `src/storage/src/statistics.rs` (around line 56-71) — the statistics update path that previously allowed regression. Commit `3e32df1f69` introduced clamping so that on a restart where `offset_known` would be loaded from the broker watermark while `offset_committed` is restored from persist, the metric does not flip into the wrong order. + +## How to check it + +Workload-side polling: + +```sql +SELECT id, offset_known, offset_committed +FROM mz_internal.mz_source_statistics_per_worker +WHERE id = ? +``` + +`assert_always!(offset_known >= offset_committed, "kafka source statistics: offset_known < offset_committed")`. 
+ +SUT-side: mirror as an `assert_always!` inside the statistics update path itself, immediately after both fields are computed but before the value is published. + +## What goes wrong on violation + +The lag metric `offset_known - offset_committed` becomes a small negative number that wraps to a huge positive number in dashboards (commonly displayed as `u64` or with `MAX(0, …)` clamping that hides the actual bug). Operational tooling that drives autoscaling or alerting off lag becomes unreliable. + +## Antithesis angle + +The most interesting timing is the very first sample after a clusterd restart. The order in which the source restores `offset_committed` (from the persist shard upper) and learns `offset_known` (from rdkafka's first metadata response) determines whether the invariant holds during the window where one is set and the other is zero. The fix in commit `3e32df1f69` clamps; Antithesis should verify the clamp covers every interleaving. + +## Existing instrumentation + +None. Pure workload-side polling assertion, optionally mirrored SUT-side. + +## Provenance + +Surfaced by: Data Integrity (metrics correctness). Direct regression target for commit `3e32df1f69`. diff --git a/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md new file mode 100644 index 0000000000000..ee2fb633240e4 --- /dev/null +++ b/test/antithesis/scratchbook/properties/reclock-mint-eventually-succeeds.md @@ -0,0 +1,61 @@ +# reclock-mint-eventually-succeeds + +## Summary + +Under transient persist outages and competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, `src/storage/src/source/reclock.rs:160-166`) eventually completes for every source-frontier advance that has data to bind. 
+ +## Code + +```rust +// src/storage/src/source/reclock.rs (around line 150-170) +loop { + match handle.compare_and_append(updates, prev_upper, new_into_upper).await { + Ok(()) => break, + Err(UpperMismatch { current, .. }) => { + self.sync(&current).await; + // recompute updates and retry + } + } +} +``` + +There is no upper bound on this loop. It depends on the persist backend eventually being responsive and on competing writers not livelocking the source. + +## Why this is a liveness property + +Antithesis's job is to assert that the loop terminates in adversarial schedules. The catalog entry asserts both: + +1. The retry path is *exercised* (the loop runs more than once at least once during a run): `Sometimes(saw_cas_retry)`. +2. The source frontier eventually advances past the contention point: a workload-observable liveness check. + +## How to check it + +SUT-side anchor: +- Add an `assert_sometimes!(reclock_cas_retry_succeeded, "reclock: mint compare_and_append retry succeeded")` immediately after a successful `compare_and_append` that was preceded by at least one `UpperMismatch`. The local counter is reset on each `mint()` invocation. + +Workload-side liveness check: +- After injecting persist consensus latency or a competing-writer scenario, observe the source's `offset_committed` advancing in `mz_internal.mz_source_statistics_per_worker`. `assert_sometimes!(source_advanced_post_contention, …)`. + +## What goes wrong on violation + +The source's frontier stops advancing without any external signal that something is wrong. Health reports `Running`. The reclock operator is in an infinite `compare_and_append` → `UpperMismatch` → `sync` → `compare_and_append` cycle. To an operator looking from outside it looks like Kafka is the problem. + +## Antithesis angle + +- Inject high persist consensus latency. With many concurrent storage workers (or restart-induced competing writers), the CaS contention rate climbs and the retry loop runs many times. 
Antithesis tests that progress still happens. +- Race the metadata fetcher's partition-add against an in-flight mint. The mint is now reckoning with an extended `source_upper`; the CaS retry must recompute updates correctly. +- Concurrent kill+restart cycles that create competing-writer scenarios. + +## Open question (resolved) + +Q: Is there any input under which `compare_and_append` returns a non-retryable error and the loop should exit? + +A: Yes — `InvalidUsage` errors (handled by `panic!("compare_and_append failed: {invalid_use}")` at `reclock/compat.rs:306`). Those terminate the source. The retry loop only handles `UpperMismatch`. Antithesis fault injection should not produce `InvalidUsage` under correct code; if it does, that is a separate property (`reclock-cas-no-invalid-usage`) but it falls under the broader `kafka-source-no-internal-panic` property already cataloged. + +## Existing instrumentation + +None. The retry loop is silent. + +## Provenance + +Surfaced by: Failure Recovery, Distributed Coordination. diff --git a/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md new file mode 100644 index 0000000000000..75a17a7446664 --- /dev/null +++ b/test/antithesis/scratchbook/properties/remap-shard-antichain-wellformed.md @@ -0,0 +1,55 @@ +# remap-shard-antichain-wellformed + +## Summary + +At every Materialize timestamp `t`, the contents of the source's remap shard accumulated to `t` form a well-formed `Antichain`. Each source-time element has multiplicity exactly 1; for multi-partition Kafka sources, there is one element per partition range with no overlaps. 
+ +## Origin + +This invariant is stated explicitly in the `ReclockOperator` doc comment (`src/storage/src/source/reclock.rs:31-34`): + +> "The `ReclockOperator` will always maintain the invariant that for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`. In other words the remap collection describes a well formed `Antichain` as it is marching forwards." + +## Code paths + +- `src/storage/src/source/reclock.rs:118-169` — `ReclockOperator::mint`. Each call: + 1. Emits retractions (`-1`) of the prior `source_upper`. + 2. Emits insertions (`+1`) of the new `source_upper`. + 3. Calls `compare_and_append` on the remap shard. + 4. On `UpperMismatch`, `sync()` and retry. +- `src/storage/src/source/reclock.rs:124` — `assert!(!new_into_upper.less_equal(&binding_ts))` guards the mint precondition. +- `src/storage/src/source/reclock.rs:321` — `assert!(prev < RB::before(pid))` guards the partition-range ordering. +- `src/storage/src/source/reclock/compat.rs:144` — `assert!` on persist handle state. +- `src/storage/src/source/reclock/compat.rs:306` — `panic!("compare_and_append failed: {invalid_use}")` for genuinely invalid CaS calls. + +## Antithesis form + +Two complementary checks: + +1. **SUT-side** inside `ReclockOperator::sync` / `mint`, after every update: walk the local accumulated state and `assert_always!(antichain_wellformed, "reclock: remap shard accumulates to well-formed antichain")` — every source-time element has multiplicity 1. This is the tightest expression of the invariant. + +2. **Workload-side** as a periodic SQL probe: select the remap shard's contents (via `mz_internal` introspection views if available) and verify the well-formed property externally. This catches the case where the SUT-side check is correct but the durable persist state diverges. + +## What goes wrong on violation + +A malformed remap antichain corrupts every subsequent restart's resume frontier. 
The source either skips data (resume frontier too far ahead), re-reads data (too far back), or panics in downstream operators that depend on well-formed antichains (e.g., the as_of computation in commit `e3805ad790`). + +## Antithesis angle + +- Concurrent reclock writers across restart: kill the storage worker mid-mint and restart it; the new worker must `sync()` the durable state and re-mint from there. If `sync()` is wrong, the new worker may insert without retracting, breaking multiplicity. +- Partition adds/removes interleaved with mints: the partition-range encoding in `RangeBound` is the part that has to stay consistent across discovery and binding. +- `compare_and_append` retry loop interactions: in the historical bug, the loop at reclock.rs:160-166 retried correctly, but the cached upper drift (commit `e3805ad790`) bypassed it. + +## Open question (resolved) + +Q: Can the in-memory `source_upper` and the persisted remap state ever diverge enough that the operator emits a malformed update batch? + +A: The `MutableAntichain` in `ReclockOperator::source_upper` is the source of truth for what *should* be persisted next. `mint()` constructs the update batch by diffing the new desired upper against the current `source_upper`. The retraction-insertion structure is what preserves the antichain-multiplicity invariant. The only divergence path is if `sync()` after `UpperMismatch` reads a state inconsistent with what `source_upper` thinks — i.e., a true persist corruption. The assertion at compat.rs:144 is meant to catch this. + +## Existing instrumentation + +The `assert!` and `panic!` calls at reclock.rs:124, :321 and compat.rs:144, :306 exist. None of them check the *accumulated antichain* property directly — they check local invariants. The recommended new assertion is an `assert_always!` over the in-memory accumulator that runs at every state transition. + +## Provenance + +Surfaced by: Data Integrity, Distributed Coordination. 
Foundational invariant for the entire reclocking subsystem. diff --git a/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md new file mode 100644 index 0000000000000..850914b374346 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-decode-error-retractable.md @@ -0,0 +1,42 @@ +# upsert-decode-error-retractable + +## Summary + +An `UpsertError` (key decode failure, null key, or value decode failure) for a key is retracted once a subsequent valid `(key, value)` message for the same key is ingested. After settling, the source reflects the corrected value and contains no remaining error row for that key. + +This is the upsert envelope's recovery contract for upstream schema mistakes — "fix the bad message and continue" without dropping the source. + +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` (line ~509-560 and following): maps decode failures to `UpsertError::NullKey` / `KeyDecode` / `Value`. The result still flows through the upsert pipeline keyed by `UpsertKey::from_key(Err(&err))` so a future good value can retract it. +- `src/storage-types/src/errors.rs:161-199` — `EnvelopeError::Upsert(UpsertError)` is the *retractable* error variant. `EnvelopeError::Flat(text)` is explicitly *not retractable*. +- `src/storage/src/upsert.rs:748-750` — error emission paths. + +## How to check it + +Workload procedure: +1. Produce a malformed message for key `K` (e.g., invalid Avro under a schema-registry-backed source, or null key on a non-null-key source). +2. Verify the source contains an error row keyed by `K`. +3. Produce a valid `(K, value)` message. +4. After quiet period, `assert_always!(upsert_error_retracted, "upsert: bad value retracted by subsequent good value")` checking that `SELECT * FROM source WHERE key = K` returns exactly one row with `value`, no error row. 
+ +## What goes wrong on violation + +If the error is not retractable, the source carries a stuck error row that nothing can clear — the only recovery is to drop and re-create the source. + +## Distinguishing retractable from non-retractable + +This property targets `EnvelopeError::Upsert(_)` only. `EnvelopeError::Flat(_)` is explicitly non-retractable and should not be tested with this property. Workloads must take care to produce errors that map to the Upsert variant — null key, malformed key/value under upsert mode — rather than envelope-fatal errors. + +## Antithesis angle + +- Race the bad and good messages closely. Verify ordering is preserved. +- Crash clusterd between the bad message ingesting and the good message ingesting. The error row must persist across the restart and the good message must retract it on resume. + +## Existing instrumentation + +None. Workload-side check. + +## Provenance + +Surfaced by: Protocol Contracts, Failure Recovery. diff --git a/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md new file mode 100644 index 0000000000000..244fb4a4ed01d --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-ensure-decoded-called-before-access.md @@ -0,0 +1,43 @@ +# upsert-ensure-decoded-called-before-access + +## Summary + +The six `StateValue` accessors that require the cell to be in `Value` form are always called after `ensure_decoded` has been called on that cell — the panics that currently guard the type-state protocol never fire. 
+ +## Targeted sites + +`src/storage/src/upsert/types.rs`: + +| Line | Accessor | Message | +|------|----------|---------| +| 297 | `into_decoded` | `panic!("called \`into_decoded without calling \`ensure_decoded\`")` | +| 369 | `into_provisional_value` | `panic!("called \`into_provisional_value\` without calling \`ensure_decoded\`")` | +| 403 | `into_provisional_tombstone` | `panic!("called \`into_provisional_tombstone\` without calling \`ensure_decoded\`")` | +| 416 | `provisional_order` | `panic!("called \`provisional_order\` without calling \`ensure_decoded\`")` | +| 430 | `provisional_value_ref` | `panic!("called \`provisional_value_ref\` without calling \`ensure_decoded\`")` | +| 440 | `into_finalized_value` | `panic!("called \`into_finalized_value\` without calling \`ensure_decoded\`")` | + +Each becomes `assert_unreachable!("upsert: <accessor> on Consolidating StateValue")` with a distinct, accessor-specific message. + +## Why this is a real property, not just dead code + +Two reasons. + +1. **Refactor net.** The upsert operator has been rewritten twice (`upsert_classic`, `upsert_continual_feedback`, `upsert_continual_feedback_v2`). Every rewrite added new call sites that touch `StateValue`. A future refactor that forgets to call `ensure_decoded` would today abort clusterd; with the Antithesis SDK in place, it surfaces as a property failure during the very first nightly run after the change. +2. **Replay anchors.** If Antithesis ever does trip one of these, the failure pinpoints the exact accessor and code path. That is materially more useful than a stack trace from a process abort, especially in a multi-replica scenario where the abort is invisible behind clusterd's auto-restart. + +## What this property does *not* catch + +This property only checks the type-state protocol — "ensure_decoded was called first." It does not check that the consolidating math itself is correct (that is `upsert-state-consolidation-wellformed`). The two are complementary. 
+ +## Antithesis angle + +These panics are most likely to fire after a code change to the upsert operator's hot path. Antithesis exercises every operator branch with random fault injection — it should reach the rewrite-sensitive accessor sites if any exist. Cost of instrumenting is trivial (rename `panic!` to `assert_unreachable!`); the value is the regression net. + +## Existing instrumentation + +The `panic!`s already exist. They abort the process on misuse. The work is wrapping each with `assert_unreachable!` so the misuse is reported. + +## Provenance + +Surfaced by: Wildcard (this is the type-state guard family that doesn't fit a standard focus). diff --git a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md new file mode 100644 index 0000000000000..90341358df926 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md @@ -0,0 +1,63 @@ +# upsert-key-reflects-latest-value + +## Summary + +At a settled timestamp, every key in an UPSERT-envelope source maps to the value from the last `(key, value)` message produced — or to no row if the last message for that key was a tombstone. + +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` converts `DecodeResult` into `(UpsertKey, Option, FromTime)`. `UpsertKey` is a SHA-256 of the key bytes (collision probability `2^-128`). +- `src/storage/src/upsert.rs` — `upsert_classic`: the main operator. For each input update at `from_time`: + 1. `multi_get(key)` → returns prior value + prior order key. + 2. Skip if `from_time <= prior_order` (stale update). + 3. Emit retraction of prior value at the new timestamp. + 4. Emit insertion of new value at the new timestamp. + 5. `multi_put(key, new_value)` updates the state store. +- `src/storage/src/upsert_continual_feedback.rs` and `_v2.rs` — alternative implementations driven by persist feedback. 
Same contract, different consolidation strategy. +- `src/storage/src/upsert/types.rs` — `StateValue::ensure_decoded` (~line 589) finalizes the XOR-checksum consolidating state into either a `Value` or a `tombstone`. Critical for snapshot replay correctness. + +## How to check it + +Workload-level: +1. Workload tracks `expected_state: Map<Key, Option<Value>>` of what was last produced per key. +2. After the fault quiet period, for a sampled set of keys: `SELECT value FROM source WHERE key = ?` and compare to `expected_state[key]`. +3. `assert_always!(upsert_value_matches_latest_produced, "upsert: key value matches latest produced")` — checked on every sample. If the workload notices a divergence, it logs the diff (expected vs. observed) for replay. + +## What goes wrong on violation + +The source returns a stale value for a key. The user's downstream MV uses it. The bug is invisible until someone manually compares the source to the upstream system. + +## Antithesis angle + +- Crash clusterd between `multi_get` and `multi_put`. The next incarnation must reconstruct state correctly from feedback. +- Race produce ordering: if Kafka delivers `(k, v1)` then `(k, v2)`, the source's order-key tracking must serialize them. Order-key regression caused a historical panic (commit `f177db8286`, materialize#26655). +- For RocksDB backend: race `multi_put` against the merge operator running async. +- For multi-replica: both replicas process the same key concurrently (commit `1accbe28b3`). + +## Open question (resolved) + +Q: Does the workload need to know about the per-source `order_key` to validate, or is `from_time` ordering sufficient? + +A: For asserting correctness at quiet periods, the workload only needs the *Kafka* produce order — the operator's job is to translate that into the correct visible value. Since Antithesis injects faults but doesn't reorder Kafka's per-partition delivery, the workload can rely on per-partition produce order to determine `expected_state`. 
Cross-partition reordering is not a concern because the workload assigns each key to a fixed partition. + +## Existing instrumentation + +None. Pure workload-side check. Optional SUT anchor: an `assert_sometimes!(upsert_emit_correct_retraction, …)` inside `upsert.rs` after a retraction is emitted whose prior value matched what was stored — this gives Antithesis a positive signal that the prior-value-lookup path is being exercised. + +## Implementation status + +Implemented 2026-05-11 as `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Three assertion messages, each unique: + +| Message | Type | When | +|---------|------|------| +| `"upsert: SELECT for key matches latest produced value"` | `always` | Per sampled live key after quiet-period catchup | +| `"upsert: tombstoned key has no row in source"` | `always` | Per sampled key whose last produced message was a tombstone | +| `"upsert: source caught up to produced offsets after quiet period"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data | + +Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_quiet.py` (`ANTITHESIS_STOP_FAULTS` wrapper), `helper_random.py` (deterministic randomness with Antithesis SDK), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`). + +No SUT-side instrumentation added in this pass — that is the candidate work in `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, and `properties/upsert-ensure-decoded-called-before-access.md`. + +## Provenance + +Surfaced by: Data Integrity, Concurrency. Direct regression target for materialize#26655. 
diff --git a/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md new file mode 100644 index 0000000000000..e9d097626e601 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-no-internal-panic.md @@ -0,0 +1,43 @@ +# upsert-no-internal-panic + +## Summary + +The upsert operator's explicit `assert!`s and `panic!`s — currently process-aborting guards — never fire under any Antithesis-injected fault sequence. Each site is converted to a uniquely-messaged `assert_always!` / `assert_unreachable!` so a firing surfaces as a reportable Antithesis property failure rather than a clusterd crash. + +## Targeted assertion sites + +| File | Line | Site | Antithesis form | +|------|------|------|------------------| +| `src/storage/src/upsert.rs` | 541 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (classic)")` | +| `src/storage/src/upsert.rs` | 636 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (classic)")` | +| `src/storage/src/upsert.rs` | 1031 | `unreachable!("pending future never returns")` | `assert_unreachable!("upsert: pending future returned (classic)")` | +| `src/storage/src/upsert_continual_feedback.rs` | 626 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v1)")` | +| `src/storage/src/upsert_continual_feedback.rs` | 800 | `panic!("key missing from commands_state")` | `assert_unreachable!("upsert: key missing from commands_state (cf v1)")` | +| `src/storage/src/upsert_continual_feedback_v2.rs` | 315 | `assert!(diff.is_positive(), "invalid upsert input")` | `assert_always!(diff.is_positive(), "upsert: input diff positive (cf v2)")` | +| `src/storage/src/upsert_continual_feedback_v2.rs` | 483 | `unreachable!()` on `(None, None)` from joined prior/new state | 
`assert_unreachable!("upsert: cf v2 join produced (None, None)")` | +| `src/storage/src/upsert/types.rs` | 580 | `panic!("merge_update_state called with non-consolidating state")` | `assert_unreachable!("upsert: merge_update_state on non-Consolidating state")` | +| `src/storage/src/upsert/types.rs` | 1062 | `panic!("attempted completion of already completed upsert snapshot")` | `assert_unreachable!("upsert: snapshot completion called twice")` | + +Each message is unique; an Antithesis failure report names exactly the site that was reached. + +## Why these sites + +These are structural invariants the operator's authors believed to be impossible. Bug history confirms several have fired in production (commits `f177db8286`, `1accbe28b3`). The cost of wrapping them with the Antithesis SDK is trivial; the upside is reportable, replayable property failures. + +## Antithesis angle + +- Multi-replica clusters: most relevant for `key missing from commands_state` and the `unreachable!` on `(None, None)`. +- Order-key edge cases: maps to the `assert!(diff.is_positive())` family. +- Snapshot completion: the `panic!("attempted completion of already completed upsert snapshot")` is reached if the snapshot-completion state machine is re-entered (rehydration after a crash that already completed snapshot). + +## Relationship to other properties + +This property is the *operator-internal* counterpart to `upsert-state-consolidation-wellformed` (which guards the math in `ensure_decoded`) and `upsert-ensure-decoded-called-before-access` (which guards the type-state protocol on `StateValue` accessors). Together they form the SUT-side instrumentation backbone for the UPSERT envelope. + +## Existing instrumentation + +The `assert!` / `panic!` calls already exist as process-aborting guards. They abort in test today; the work is converting them to `assert_always!`/`assert_unreachable!` so failures are *reported* rather than masked as "clusterd was restarted." 
Each site gets a distinct, specific message per the property-catalog requirement that assertion messages be unique. + +## Provenance + +Surfaced by: Concurrency, Failure Recovery. Regression targets: commits `f177db8286`, `1accbe28b3`, materialize#26655, database-issues#9160. diff --git a/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md new file mode 100644 index 0000000000000..d65161bba6766 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-state-consolidation-wellformed.md @@ -0,0 +1,75 @@ +# upsert-state-consolidation-wellformed + +## Summary + +`StateValue::ensure_decoded` always finalizes a `Consolidating` cell into either a `Value(value)` (when `diff_sum == 1` and the recovered bytes match the stored `len_sum` and seahash `checksum_sum`) or a `tombstone()` (when `diff_sum == 0` and the entire accumulator is zero). Any other state — non-{0,1} `diff_sum`, mismatched checksum, non-zero residue on a tombstone — is an XOR/accounting corruption and must never be observed. 
+ +## Code + +`src/storage/src/upsert/types.rs:584-682`: + +```rust +pub fn ensure_decoded(&mut self, bincode_opts, source_id, key) { + match self { + StateValue::Consolidating(consolidating) => { + match consolidating.diff_sum.0 { + 1 => { + let len = usize::try_from(consolidating.len_sum.0)...expect(...); + let value = &consolidating.value_xor.get(..len)...expect(...); + assert_eq!(consolidating.checksum_sum.0, seahash::hash(value) as i64, ...); + *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap()); + } + 0 => { + assert_eq!(consolidating.len_sum.0, 0, ...); + assert_eq!(consolidating.checksum_sum.0, 0, ...); + assert!(consolidating.value_xor.iter().all(|&x| x == 0), ...); + *self = Self::tombstone(); + } + other => panic!("invalid upsert state: non 0/1 diff_sum: {other}, ..."), + } + } + StateValue::Value(_) => {} + } +} +``` + +## Antithesis form + +Each of the four assertions and the final `panic!` in this function becomes a uniquely-messaged `assert_always!`: + +| Existing | Antithesis form | Message | +|---|---|---| +| `assert_eq!(checksum_sum, seahash::hash(value))` (621) | `assert_always!(checksum_sum == seahash::hash(value), …)` | `"upsert: consolidating checksum_sum mismatch (diff_sum=1)"` | +| `assert_eq!(len_sum, 0)` (632) | `assert_always!(len_sum == 0, …)` | `"upsert: consolidating len_sum nonzero (diff_sum=0)"` | +| `assert_eq!(checksum_sum, 0)` (637) | `assert_always!(checksum_sum == 0, …)` | `"upsert: consolidating checksum_sum nonzero (diff_sum=0)"` | +| `assert!(value_xor.iter().all(==0))` (642) | `assert_always!(value_xor.iter().all(==0), …)` | `"upsert: consolidating value_xor nonzero (diff_sum=0)"` | +| `panic!("invalid upsert state: non 0/1 diff_sum: {other}, …")` (672) | `assert_always!(false, …)` | `"upsert: consolidating diff_sum not in {0,1}"` | + +Plus the two `expect("invalid upsert state")` calls at 606 and 619 (slice-into-bytes failures); these should become `assert_always!(value_xor.len() >= len, …)` with a distinct message. 
+ +## What goes wrong on violation + +The XOR-based consolidation collapses many `(diff, bytes)` updates per key into a single accumulator. The math only works if every retraction is exactly paired with its insertion. A trip into the non-{0,1} branch indicates one of: + +- A duplicate retraction (commit `1accbe28b3` style multi-replica double-drain). +- A retraction without a matching insertion in the replay stream (incomplete feedback delivery across crash). +- A `seahash` collision (negligible probability — if seen, it's a bug elsewhere, not the hash). +- A bug in the `merge_update_state` math (`upsert/types.rs:533+`). + +## Antithesis angle + +- Kill clusterd mid-feedback-replay; restart and assert that `ensure_decoded` always completes cleanly. +- Multi-replica with concurrent drains feeding the same RocksDB backend. +- Race RocksDB's async merge operator against `multi_put`. + +## Why this is the deepest signal + +The XOR/checksum consolidation is the *math*: if this assertion ever trips, something upstream — feedback delivery, retraction emission, or order-key tracking — produced an inconsistent update sequence. The signal is high because the assertion is at the *bottom* of the pipeline; everything else has had a chance to introduce the bug, but only this site can detect it. + +## Existing instrumentation + +The runtime `panic!` and `assert!`s already exist and would abort clusterd on violation. Today, an abort in test looks like "the storage worker crashed" — possibly retried, possibly noticed only via a log scrape. Wrapping them with Antithesis assertions turns each into a reportable, replay-anchored property failure with a unique signature. + +## Provenance + +Surfaced by: Data Integrity, Concurrency (via the multi-replica drain bug history). 
diff --git a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md new file mode 100644 index 0000000000000..336deb408759b --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md @@ -0,0 +1,46 @@ +# upsert-state-rehydrates-correctly + +## Summary + +After a clusterd restart, the rehydrated upsert state — observed via `SELECT * FROM source` — equals the state at the most recent durable timestamp before the restart, for every key produced so far. + +## Code paths + +- `src/storage/src/upsert.rs:791-799` — snapshot phase: drain input at `resume_upper` boundary, all snapshot values marked with `provisional_order = None` (sorts lowest). +- `src/storage/src/upsert/types.rs:1062` — `panic!("attempted completion of already completed upsert snapshot")` is the guard for the snapshot-completion state machine. +- `src/storage/src/upsert/types.rs:584-682` — `StateValue::ensure_decoded` finalizes the consolidating state. The `diff_sum ∈ {0, 1}` invariant must hold at completion time. +- `src/storage/src/upsert_continual_feedback.rs` — the continual-feedback variant uses a persist `Listen` to receive feedback values; the same correctness contract applies. + +## How to check it + +Workload procedure: +1. Produce many `(key, value)` and `(key, null)` messages; track `expected_state`. +2. Wait for `offset_committed` to advance past last produced offset. +3. Snapshot `expected_state` and the source's `SELECT * FROM source` content side-by-side; assert equality. +4. Kill clusterd; wait for restart and quiet period. +5. Re-run the comparison: `SELECT * FROM source` must equal the pre-kill snapshot. +6. `assert_always!(upsert_state_rehydrated_correctly, "upsert: rehydrated state equals pre-restart state")`. 
+ +## What goes wrong on violation + +The source comes back with wrong values per key, missing keys, or keys that should be tombstoned but are present. The bug is silent — the source reports healthy and the workload sees plausible-but-wrong data. + +## Antithesis angle + +The interesting window is between the persist sink's `compare_and_append` succeeding for batch N and the upsert operator's *next* snapshot-completion. If a crash drops feedback delivery between those two points, the next incarnation's snapshot may see partial state and complete with the wrong tombstone/value mapping. + +Compounded by RocksDB merge operator behavior (commit `0d8d740b47`): if the merge operator interleaves with snapshot completion in a way that drops a tombstone, the rehydrated state diverges. + +## Dependencies + +- Requires node-termination faults enabled. +- Combine with `upsert-state-consolidation-wellformed` (the deeper `ensure_decoded` correctness check) for full coverage of the snapshot path. +- Combine with `kafka-source-no-data-duplication` to rule out the related failure mode where rehydration introduces duplicates rather than wrong values. + +## Existing instrumentation + +None. Candidate SUT anchors: an `assert_sometimes!(upsert_snapshot_completed, "upsert: snapshot phase completed")` at the snapshot-completion call site, and `assert_always!(diff_sum_in_range, …)` mirroring the existing `panic!` in `ensure_decoded`. + +## Provenance + +Surfaced by: Failure Recovery, Data Integrity. 
diff --git a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md new file mode 100644 index 0000000000000..74f5f13a7ba49 --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md @@ -0,0 +1,38 @@ +# upsert-tombstone-removes-key + +## Summary + +A `(key, null)` tombstone message eventually removes the key from the UPSERT source, and the key stays absent until a non-null value is produced for it. + +## Code paths + +- `src/storage/src/render/sources.rs` — `upsert_commands` maps `None` value → tombstone signal: `(UpsertKey, None, from_time)`. +- `src/storage/src/upsert.rs` — `upsert_classic`: on `None` value with existing prior value, emit retraction at new timestamp and `multi_put(key, tombstone)`. +- `src/storage/src/upsert/types.rs` — `StateValue::tombstone()` constructor; `ensure_decoded` with `diff_sum == 0` produces this state. + +## How to check it + +Workload procedure: +1. Produce `(key, v)` to topic. +2. Wait for source to ingest it; verify row visible. +3. Produce `(key, null)`. +4. After quiet period, `assert_always!(tombstoned_key_absent, "upsert: tombstoned key has no row")`, where the condition is that `SELECT count(*) FROM source WHERE key = ?` returns 0. +5. Bonus: kill clusterd, restart, assert the row is still absent (no resurrection). + +## What goes wrong on violation + +A deleted row reappears after restart. Compliance and correctness hazard. The likely cause is the snapshot replay misinterpreting a tombstone consolidating state — the `diff_sum == 0` branch of `ensure_decoded` is what guards this. + +## Antithesis angle + +- Crash between tombstone retraction emit and `multi_put(tombstone)`. The state store is now ahead/behind the persisted output; the snapshot replay on restart is what reconciles. +- Race `(k, v)`, `(k, null)`, `(k, v')` deliveries: every interleaving must end with `v'` visible.
+- For the no-resurrection half: produce tombstone, wait for `offset_committed` to advance past its offset, then kill clusterd. On restart, the key must not reappear. + +## Existing instrumentation + +None. Workload-side check. The `StateValue::tombstone` construction path and the `ensure_decoded` tombstone branch are the relevant code; adding `assert_sometimes!(tombstone_emitted, ...)` inside the tombstone-emit path gives a coverage signal. + +## Provenance + +Surfaced by: Data Integrity, Lifecycle Transitions (delete operations). diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index ffbba999a7031..0645f1e868414 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,6 +1,6 @@ --- -commit: ca6deb6758e651876582ae7d4dec24ce32d87567 -updated: 2026-05-06 +commit: 007c7af9d9970fb2030c7212368b232e0fbc363e +updated: 2026-05-11 --- # Property Catalog: Materialize @@ -53,6 +53,17 @@ Properties that verify data correctness when crashes, network partitions, and co | **Antithesis Angle** | Inject network failures on consensus calls mid-flight. Kill writer after batch is queued but before state is committed. Antithesis explores the window between consensus write and acknowledgment. | | **Why It Matters** | Indeterminate errors are the hardest to handle correctly in distributed systems. Duplication or loss here silently corrupts downstream materialized views. Surfaced by: Data Integrity. | +### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — incorrect fencing allows premature GC causing data loss | +| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. 
| +| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. | +| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. | +| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. | + ## Category 2: Consistency Model Enforcement Properties that verify Materialize's strict serializability guarantee and timestamp oracle correctness. @@ -205,13 +216,183 @@ Properties that verify the system reaches interesting states under fault injecti | **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. | | **Why It Matters** | This is the end-to-end user-visible correctness property. Materialize's value proposition is that MVs are always up-to-date. Surfaced by: Product Context. | -### critical-reader-fence-linearization — Critical Reader Opaque Token Linearizes +## Category 7: Kafka Source Ingestion (Append-Only + UPSERT) + +Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` → `ReclockOperator` → optional decode/UPSERT → `persist_sink`. Both envelopes are covered, with shared properties for reclocking and source-frontier behavior. Workload-level checks compare produced Kafka records against what a SQL `SELECT` over the source returns; SUT-side checks live in the source/upsert/reclock operators. 
+ +### kafka-source-no-data-loss — Every Produced Record Is Eventually Visible + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P0 — primary user-visible contract; "data is in Kafka but not in Materialize" is the worst possible streaming bug | +| **Property** | After producing a message to a Kafka topic, the Materialize source over that topic eventually contains a row corresponding to that message (NONE envelope) or a row reflecting the latest value for that key (UPSERT envelope). | +| **Invariant** | `Sometimes(all_produced_records_visible)`: at least once during a run, after a quiet period, the workload observes `COUNT(*) FROM source` >= number of produced records (NONE) or every produced (key, value) pair is reflected in the source state (UPSERT). Liveness, so `Sometimes` on the catch-up event. | +| **Antithesis Angle** | Network partitions between Materialize and Kafka, clusterd kills mid-ingestion, persist write retries, and rebalances. The interesting timing is the *crash mid-batch* window: some offsets are in persist, some are not, and the resume frontier determines what we re-read. Antithesis explores whether the re-read covers exactly the missing offsets. | +| **Why It Matters** | This is the headline guarantee of a streaming database. A bug here is silent data loss visible to every user of the source. Supersedes the more generic `source-ingestion-progress` for Kafka specifically. | + +### kafka-source-no-data-duplication — No Record Appears Twice After Settling | | | |---|---| | **Type** | Safety | -| **Priority** | P1 — incorrect fencing allows premature GC causing data loss | -| **Property** | When two concurrent critical readers attempt compare_and_downgrade_since with mismatched opaque tokens, exactly one succeeds in updating the shard's since. No reader can re-observe an old opaque value after a SeqNo increment. 
| -| **Invariant** | `Always`: concurrent compare_and_downgrade_since operations with different opaques result in exactly one mutation. The winner's opaque is durably recorded; the loser gets a mismatch. | -| **Antithesis Angle** | Inject network delays between state check and state commit. Fail CaS operations after token comparison but before state write. Antithesis explores concurrent reader contention. | -| **Why It Matters** | Critical readers control garbage collection boundaries. Incorrect fencing allows premature GC, which deletes data needed by active readers. Surfaced by: Data Integrity. | +| **Priority** | P0 — silent duplication corrupts every aggregate downstream MV | +| **Property** | After settling, the NONE-envelope source contains at most one row per `(partition, offset)` tuple; the UPSERT-envelope source contains at most one row per key. | +| **Invariant** | `Always`: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1,2 HAVING COUNT(*) > 1` returns no rows for NONE; `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns no rows for UPSERT. Checked on every assertion firing — must hold on every observation. | +| **Antithesis Angle** | Reader crashes between persist-sink batch write and `compare_and_append`; rehydration re-reads offsets we already wrote. The protection lives in `last_offsets` filtering (kafka.rs:1158) but only for the *current* incarnation — across restart, idempotency depends on the persist sink and (for UPSERT) the feedback-driven snapshot. Antithesis explores crash/restart timing across batch boundaries. Direct regression target for upsert double-retraction bug (commit 1accbe28b3, database-issues#9160). | +| **Why It Matters** | Duplicate rows in the source flow into every downstream materialized view's aggregates and joins. Silent and devastating. 
| + +### kafka-source-frontier-monotonic — Source Persist Shard Upper Never Regresses + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — frontier regression panics downstream operators and breaks `AS OF` queries | +| **Property** | The `upper` frontier of the source's data persist shard never regresses across the lifetime of the source, including across clusterd restarts and `compare_and_append` retries. | +| **Invariant** | `Always`: observed `upper(t2) >= upper(t1)` for any observation order `t1 < t2`. Checked on every observation in a workload polling loop, and ideally also as a SUT-side `assert_always!` next to the persist sink's `compare_and_append`. | +| **Antithesis Angle** | Kill clusterd mid-`compare_and_append`; resume the source with a stale cached upper; concurrent reclock and persist-sink writers. Direct regression target for the `as_of`/reclock-upper race (commit e3805ad790, database-issues#8698) and the persist-sink cached upper bug (commit 505dc96aaa). | +| **Why It Matters** | Frontier regression manifests as panics (`as_of > upper`) or as observably incorrect AS OF queries. Documented invariant for persist. | + +### kafka-source-survives-broker-fault — Source Resumes After Broker Connectivity Restored + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — operational expectation; broker faults are a routine condition | +| **Property** | After a transient network partition or Kafka broker outage that prevents the source from making progress, once connectivity is restored, the source eventually ingests all messages that were produced during the outage. | +| **Invariant** | `Sometimes(source_resumes_after_broker_fault)`: at least once per run, after injecting a network fault between materialized and Kafka and then calling `ANTITHESIS_STOP_FAULTS`, the workload observes the source's `COUNT(*)` advance past its pre-fault value. 
| +| **Antithesis Angle** | Network partition between the `materialized` container and the Kafka container; persist+metadata stay reachable. Tests rdkafka reconnect, snapshot statistics restoration (commit 0a34b6c79d), and that no permanent stall mode is entered. | +| **Why It Matters** | Cloud streaming setups routinely see transient Kafka unavailability. A source that gets stuck and never recovers is an outage. | + +### kafka-source-survives-clusterd-restart — Source Resumes After clusterd Crash + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P1 — recovery from clusterd kill is the most common operational fault path | +| **Property** | After clusterd (storage worker) is killed and restarted, the Kafka source recovers, replays the right resume offsets, and ingests messages produced before, during, and after the restart. | +| **Invariant** | `Sometimes(source_recovered_after_clusterd_restart)`: after a kill+restart, eventually `COUNT(*) FROM source >= produced_count`. Combined with `kafka-source-no-data-duplication` to also rule out double-counting. | +| **Antithesis Angle** | Direct test of the `storage-command-replay-idempotent` mechanism end-to-end through Kafka. Antithesis explores crash timing across the reclock mint, persist-sink append, and upsert snapshot-completion windows. Requires node-termination faults to be enabled. | +| **Why It Matters** | This is the recovery contract the storage controller is built around. Failure here makes every higher-level property meaningless. | + +### upsert-key-reflects-latest-value — UPSERT Source Reflects Latest Value Per Key + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — the entire user-visible promise of the UPSERT envelope | +| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. 
Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets after quiet period"). | +| **Property** | At a settled timestamp, for each key produced by the workload, the UPSERT source contains exactly the value from the last `(key, value)` message produced — or no row if the last message for that key was a tombstone. | +| **Invariant** | `Always`: for every workload-tracked key, `SELECT value FROM source WHERE key = ?` returns the expected value (or empty for tombstoned keys), as determined by the workload's local model of what it produced. Checked after `ANTITHESIS_STOP_FAULTS` quiet periods. | +| **Antithesis Angle** | Reorder produce timing, kill clusterd between the prior-value lookup (`multi_get`) and the new-value write (`multi_put`), inject delays in the feedback-driven snapshot phase. Tests order-key monotonicity (commit f177db8286), state-backend consistency, and snapshot-completion correctness. | +| **Why It Matters** | UPSERT semantics — "the source mirrors the upstream key/value store" — is the reason customers pick this envelope. Wrong value per key is silent corruption that flows into all downstream MVs. | + +### upsert-tombstone-removes-key — Tombstone Eventually Removes the Key + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — delete semantics are routinely relied on for GDPR/correctness | +| **Property** | After producing a `(key, null)` tombstone message to the Kafka topic, the UPSERT source eventually contains no row for that key, and the row stays absent until a new non-null value is produced. | +| **Invariant** | `Always`: at any settled observation after the tombstone has been ingested (resume_upper > tombstone offset), `SELECT * FROM source WHERE key = ?` returns 0 rows. 
The "no resurrection" half is also `Always`: a key that has been tombstoned and not re-inserted must not reappear after a clusterd restart or rehydration cycle. | +| **Antithesis Angle** | Race the tombstone against a state-store snapshot completion. Crash clusterd between persist sink writing the retraction and the upsert state recording the tombstone. The `StateValue::Value` -> tombstone path in `upsert/types.rs` is the relevant code; bugs here look like resurrected rows. | +| **Why It Matters** | A "deleted" row reappearing is both a correctness bug and a compliance hazard. | + +### upsert-state-rehydrates-correctly — UPSERT State Reconstructs Exactly After Restart + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — incorrect rehydration produces wrong-but-plausible-looking output | +| **Property** | After a clusterd restart, the rehydrated upsert state, as observed via `SELECT * FROM source`, equals the state at the most recent durable timestamp before the restart, for every key produced so far. | +| **Invariant** | `Always`: after a kill+restart quiet period, the workload's local key/value model matches the source's contents for every key whose latest message has `offset <= resume_upper`. Combines with `kafka-source-no-data-duplication` (no double inserts on rehydration) and `upsert-key-reflects-latest-value` (correct value per key). | +| **Antithesis Angle** | The interesting window is between `compare_and_append` of the persist sink and the upsert operator's feedback-driven snapshot completion. If the feedback replay deduplication is wrong, rehydrated state diverges from durable state. Direct regression target for the upsert snapshot-completion logic in `upsert/types.rs` and `upsert_continual_feedback*`. | +| **Why It Matters** | Wrong rehydration is silent — the source comes up "healthy" and serves bad data. Hardest class of bug to detect in production. 
| + +### upsert-decode-error-retractable — Bad Value Errors Are Retracted By Subsequent Good Value + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — documented contract; supports operational "fix the bad message and continue" recovery | +| **Property** | When a Kafka message decoding produces an `UpsertError::Value` (or `UpsertError::KeyDecode` or `UpsertError::NullKey`) for a key, and a subsequent message produces a valid `(key, value)` pair for the same key, the source state for that key transitions from "row containing error" to "row containing the new value" — i.e. the error is retracted. | +| **Invariant** | `Always`: at a settled timestamp after the corrective message has been ingested, `SELECT * FROM source WHERE key = ?` returns the corrected value with no remaining error row. Note this is the *upsert*-specific retractability (`EnvelopeError::Upsert(..)`); `EnvelopeError::Flat(..)` is explicitly non-retractable. | +| **Antithesis Angle** | Produce an undecodable value, then a good value for the same key, while injecting delays between the two. Race against snapshot completion (errored value during snapshot vs. corrected value post-snapshot). | +| **Why It Matters** | Encoded as the operational contract by which users recover from upstream schema mistakes without dropping the source. Code in `upsert_commands` (render/sources.rs) and `upsert.rs` is the relevant path. | + +### upsert-no-internal-panic — Upsert Operator's Internal Asserts Never Fire + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit | +| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. 
Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). | +| **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. | +| **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. | +| **Why It Matters** | These panics indicate the operator entered an internal state its author thought was impossible. Past bugs (commits f177db8286, 1accbe28b3) reached production exactly through these paths. The asserts already exist; we just need to wrap them with the Antithesis SDK so the failures become reportable properties rather than process kills. 
| + +### upsert-state-consolidation-wellformed — `ensure_decoded` Resolves To `diff_sum ∈ {0, 1}` With Matching Checksums + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P0 — directly guards upsert state-store data integrity; catches XOR/checksum corruption | +| **Property** | When the upsert state backend's `StateValue::ensure_decoded` finalizes a `Consolidating` cell into either a live `Value` or a `tombstone`, the consolidating accumulator is well-formed: `diff_sum ∈ {0, 1}`; if `diff_sum == 1` the recovered bytes match the recorded `len_sum` and `checksum_sum` (seahash of `value_xor[..len_sum]`); if `diff_sum == 0` then `len_sum == 0`, `checksum_sum == 0`, and every byte of `value_xor` is zero. | +| **Invariant** | `Always`: the `panic!("invalid upsert state: non 0/1 diff_sum: …")` at `upsert/types.rs:672` becomes an `assert_always!(false, "upsert: consolidating diff_sum not in {0,1}")` with a unique message. The intermediate `assert_eq!`s at :621, :632, :637 and the `assert!` at :642 are likewise upgraded to `assert_always!` so they report rather than crash. Each site gets a distinct, specific message. | +| **Antithesis Angle** | The consolidating state collapses many `(diff, bytes)` updates per key into running `diff_sum`, `len_sum`, `checksum_sum`, and an XOR-merged `value_xor` blob. The invariant relies on (a) every retraction being paired with an identical insertion in the snapshot stream, and (b) the snapshot completion contract delivering exactly the durable state at the resume frontier. Antithesis explores: crash mid-snapshot-replay, RocksDB merge operator interleaved with multi_put, partial feedback delivery across restart, and (most subtly) duplicated retractions from multi-replica drain (commit 1accbe28b3). Any of these can break the XOR cancellation and trip a non-{0,1} diff_sum. | +| **Why It Matters** | This is the deepest "the math broke" guard in the upsert pipeline.
A trip here means either the feedback stream replayed wrong contents or a duplicate retraction snuck through. The existing panic already dumps a rich diagnostic — wrapping it as an Antithesis assertion turns it into a reportable, replayable property failure rather than a process abort. | + +### upsert-ensure-decoded-called-before-access — Consolidating State Is Always Decoded Before Use + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P2 — type-state protocol invariant; high-signal as a replay anchor | +| **Property** | Every accessor on `StateValue` that requires the cell to be in `Value` form is preceded by a call to `ensure_decoded` for that cell. The six accessor panics — `into_decoded` (297), `into_provisional_value` (369), `into_provisional_tombstone` (403), `provisional_order` (416), `provisional_value_ref` (430), `into_finalized_value` (440) — never fire. | +| **Invariant** | `Unreachable`: each ``panic!("called `...` without calling `ensure_decoded`")`` site is converted to a distinct `assert_unreachable!("upsert: <accessor> on Consolidating")`, where `<accessor>` is the name of the accessor. Six unique assertion messages, one per accessor, so an Antithesis report distinguishes which contract was violated. These are pure protocol-misuse guards — they cannot fire in valid execution. | +| **Antithesis Angle** | These panics are most likely to fire after a code change to the upsert operator (e.g. a new code path that forgets `ensure_decoded` before reading `provisional_value`). Antithesis exercises every operator branch under fault injection; turning these into reachability assertions gives a cheap regression-detection net for future refactors of `upsert.rs` / `upsert_continual_feedback*.rs`. They are also useful replay anchors — if Antithesis ever does reach them, the bug is reproducible. | +| **Why It Matters** | These guard a type-state contract that is currently enforced only at runtime.
The cost of instrumenting them is essentially zero (rename `panic!` to `assert_unreachable!`), and the upside is that any future violation surfaces as a property failure that can be replayed deterministically. | + +### kafka-source-no-internal-panic — Kafka Source Reader's Explicit Panics Never Fire + +| | | +|---|---| +| **Type** | Reachability (Unreachable) | +| **Priority** | P1 — direct regression target for topic-recreation and offset-handling bugs | +| **Property** | The explicit panics in `kafka.rs` never fire: `panic!("got negative offset (...)")` (kafka.rs:1193); `panic!("unexpected source export details: ...")` (kafka.rs:276); the `assert!(self.last_offsets[output][partition])` (kafka.rs:1142); plus the `expect()` sites on resume-upper / statistics / offset arithmetic. | +| **Invariant** | `Unreachable`: each site is converted to a unique `assert_unreachable!("kafka: <site>")`, where `<site>` identifies the specific panic site. The "negative offset" panic in particular is a known structural-invariant violation that has fired before. | +| **Antithesis Angle** | Topic deletion + recreation, partition rebalancing, manual offset reset on the Kafka broker, clock jumps that interact with Kafka's internal offset arithmetic. Direct regression target for commit 99ad668af5 (capability downgrade on topic recreation). | +| **Why It Matters** | A panic in the source reader takes down the storage worker. Replacing the panic with an Antithesis assertion gives a *reportable* failure rather than a crash that masks itself as "clusterd was restarted."
| + +### remap-shard-antichain-wellformed — Remap Shard Accumulates To Well-Formed Antichain + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P1 — load-bearing invariant for reclock correctness; explicitly stated in source doc comment | +| **Property** | At every Materialize timestamp `t`, the remap shard's contents accumulated to `t` form a well-formed `Antichain`: each source-time element has frequency exactly 1, the antichain is not empty if any source data has been bound, and (under multi-partition source) there is one element per partition range with no overlaps. | +| **Invariant** | `Always`: enforced as an `assert_always!` inside `ReclockOperator::mint`/`sync` after every state update — that's where the doc comment promises the invariant (reclock.rs:31-34). Workload-level approximation: a periodic SQL query that joins source/remap progress with computed offsets and verifies one-to-one. | +| **Antithesis Angle** | Concurrent reclock writers (across restart), partition adds/removes between mints, `compare_and_append` retries that interleave with metadata refresh. The remap shard is the only place where source-time → into-time is durably recorded; a malformed antichain corrupts every subsequent restart's resume frontier. | +| **Why It Matters** | This is the foundational reclock invariant. Violation here breaks recovery (resume_upper computed wrong), `AS OF` semantics, and the upsert operator's snapshot phase. | + +### reclock-mint-eventually-succeeds — Reclock Mint Completes Despite CaS Retries + +| | | +|---|---| +| **Type** | Liveness | +| **Priority** | P2 — pre-existing concern under persist instability | +| **Property** | Under transient persist outages or competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, reclock.rs:160-166) eventually completes for every source-frontier advance that has data to bind. 
| +| **Invariant** | `Sometimes(mint_completed_after_cas_retry)`: at least once per run, Antithesis observes a reclock mint that took >1 CaS attempt and then completed (i.e. a successful retry path was exercised). Critically, the workload should also observe that the source frontier eventually advances past the value of `source_upper` captured at the time of the contention — i.e. the loop is not livelocked. | +| **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. | +| **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. | + +### offset-known-not-below-committed — Source Statistics Causality + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — observable statistics correctness; regression target for commit 3e32df1f69 | +| **Property** | For every Kafka source, the source-statistics view always reports `offset_known >= offset_committed`. The metric `offset_known` reflects what the broker has told us is available; `offset_committed` reflects what Materialize has durably ingested. Causally, `offset_known` cannot lag `offset_committed`. | +| **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. | +| **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. 
Direct regression target for commit 3e32df1f69. | +| **Why It Matters** | The statistics view is consumed by users and by operational tooling to compute lag. A regression in causality makes lag metrics meaningless and is the kind of bug that survives unit tests but fails under adversarial timing. | diff --git a/test/antithesis/scratchbook/property-relationships.md b/test/antithesis/scratchbook/property-relationships.md index 4df508acd6f03..b8b250fc37233 100644 --- a/test/antithesis/scratchbook/property-relationships.md +++ b/test/antithesis/scratchbook/property-relationships.md @@ -48,9 +48,48 @@ Both test the 0DT deployment pipeline. `deployment-lag-detection` is a prerequis **Suspected dominance**: `deployment-promotion-safety` is stronger — it requires both lag detection and correct fencing. `deployment-lag-detection` is a liveness check on a subsystem of the promotion pipeline. +## Cluster 7: Kafka Source — User-Visible Ingestion Correctness + +**Properties**: `kafka-source-no-data-loss`, `kafka-source-no-data-duplication`, `kafka-source-frontier-monotonic`, `kafka-source-survives-broker-fault`, `kafka-source-survives-clusterd-restart` + +End-to-end Kafka source ingestion contract observable from the workload side. `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` are the inverse-pair safety/liveness checks: every produced message must show up *exactly once*. The two recovery properties (`survives-broker-fault`, `survives-clusterd-restart`) exercise the same contract under different fault classes. `kafka-source-frontier-monotonic` is the lower-level safety property that both no-loss and no-duplication depend on. + +**Suspected dominance**: `kafka-source-frontier-monotonic` underpins both `no-data-loss` and `no-data-duplication` — if the persist shard upper goes backwards, both higher-level properties fail. 
`survives-clusterd-restart` strictly implies `survives-broker-fault` for the recovery code path (clusterd restart triggers all the same rehydration logic plus more), but the two stress different fault classes. + +## Cluster 8: UPSERT Envelope — Per-Key Semantics + +**Properties**: `upsert-key-reflects-latest-value`, `upsert-tombstone-removes-key`, `upsert-state-rehydrates-correctly`, `upsert-decode-error-retractable` + +The user-visible UPSERT contract. `upsert-key-reflects-latest-value` is the headline: latest produced value per key wins. `upsert-tombstone-removes-key` is the special-case for `None` values. `upsert-state-rehydrates-correctly` is the post-crash version of `latest-value`. `upsert-decode-error-retractable` is the error-recovery half of the contract — bad messages can be retracted. + +**Suspected dominance**: `upsert-state-rehydrates-correctly` implies `upsert-key-reflects-latest-value` in steady state (rehydration produces the right state, and that state is what subsequent operations operate on). `upsert-tombstone-removes-key` is a special case of `upsert-key-reflects-latest-value` (the "last produced was null" case). `upsert-decode-error-retractable` is independent. + +## Cluster 9: UPSERT Operator Internals — SUT-Side Asserts + +**Properties**: `upsert-no-internal-panic`, `upsert-state-consolidation-wellformed`, `upsert-ensure-decoded-called-before-access` + +Operator-internal correctness backbone for the UPSERT envelope. All three properties are about converting existing `panic!`/`assert!` sites in the upsert code into Antithesis-reportable assertions. `upsert-state-consolidation-wellformed` is the math-correctness check (XOR/checksum invariants in `ensure_decoded`); `upsert-ensure-decoded-called-before-access` is the type-state protocol check on `StateValue` accessors; `upsert-no-internal-panic` is the broader umbrella covering the diff-positive / commands-state / snapshot-completion guards. 
+ +**Suspected dominance**: `upsert-state-consolidation-wellformed` is the deepest signal — a trip there indicates upstream code already failed to preserve some invariant. `upsert-no-internal-panic`'s `assert!(diff.is_positive())` family catches a similar class of upstream-bug-evidence higher up the stack. + +## Cluster 10: Kafka Source Internals — SUT-Side Asserts + +**Properties**: `kafka-source-no-internal-panic`, `remap-shard-antichain-wellformed`, `reclock-mint-eventually-succeeds`, `offset-known-not-below-committed` + +Reclock and source-reader operator-internal correctness. `remap-shard-antichain-wellformed` is the load-bearing invariant for the entire reclocking subsystem; `reclock-mint-eventually-succeeds` is its liveness companion. `kafka-source-no-internal-panic` is the umbrella for the explicit reader asserts. `offset-known-not-below-committed` is a much narrower statistics-causality check. + +**Suspected dominance**: `remap-shard-antichain-wellformed` underpins everything in Cluster 7 — a malformed remap antichain corrupts the resume frontier, which breaks both data-loss and data-duplication properties at the next restart. 
+ ## Cross-Cluster Connections - `epoch-fencing-prevents-split-brain` (Cluster 2) protects `catalog-recovery-consistency` (Cluster 3) — fencing ensures only one writer during recovery - `persist-cas-monotonicity` (Cluster 1) underpins `catalog-recovery-consistency` (Cluster 3) — catalog is stored in persist, so CaS correctness is a prerequisite - `strict-serializable-reads` (Cluster 4) depends on `epoch-fencing-prevents-split-brain` (Cluster 2) — split-brain would allow inconsistent timestamp assignments - `idempotent-write-under-indeterminate` (Cluster 1) protects `storage-command-replay-idempotent` (Cluster 3) — storage ingestion uses persist writes, so idempotency matters for both +- `persist-cas-monotonicity` (Cluster 1) underpins `kafka-source-frontier-monotonic` (Cluster 7) — frontier monotonicity at the source level is a direct consequence of CaS monotonicity at the persist level +- `storage-command-replay-idempotent` (Cluster 3) supports `kafka-source-survives-clusterd-restart` (Cluster 7) — correct command replay is required for source recovery to be idempotent +- `idempotent-write-under-indeterminate` (Cluster 1) supports `kafka-source-no-data-duplication` (Cluster 7) — the no-duplicate-write guarantee at the persist level is what makes no-data-duplication observable at the source level +- `remap-shard-antichain-wellformed` (Cluster 10) underpins `kafka-source-no-data-loss` and `kafka-source-no-data-duplication` (Cluster 7) — a malformed remap antichain breaks the resume frontier across restart +- `upsert-state-consolidation-wellformed` (Cluster 9) underpins `upsert-state-rehydrates-correctly` (Cluster 8) — if the consolidating math is wrong, rehydration is wrong +- `source-ingestion-progress` (Cluster 4, pre-existing) is now subsumed by `kafka-source-no-data-loss` (Cluster 7) for Kafka specifically; `source-ingestion-progress` remains relevant for non-Kafka sources (Postgres CDC, MySQL CDC, generators) +- `mv-reflects-source-updates` (Cluster 4) depends 
on every Cluster 7 and Cluster 8 property — MVs over Kafka sources inherit those sources' correctness diff --git a/test/antithesis/scratchbook/sut-analysis.md b/test/antithesis/scratchbook/sut-analysis.md index a0ff7561eed5e..c38442d9d96c0 100644 --- a/test/antithesis/scratchbook/sut-analysis.md +++ b/test/antithesis/scratchbook/sut-analysis.md @@ -215,3 +215,84 @@ Materialize is organized into three logical layers that run as separate processe - What is the preferred metadata store for Antithesis testing — CockroachDB or PostgreSQL? - Should we test with multiple compute replicas or single replica? - Are there specific failure scenarios the Materialize team wants prioritized? + +## Appendix A: Kafka Source Ingestion (Detail) + +Added 2026-05-11 in response to scoping toward Kafka source properties (append-only + UPSERT envelope). + +### Pipeline shape + +`KafkaSourceReader` → `ReclockOperator` → (optional `decode`) → (optional `upsert` operator) → `persist_sink`. + +The dataflow is rendered in `src/storage/src/render/sources.rs`. The reader and metadata-fetcher are constructed by `SourceRender for KafkaSourceConnection` in `src/storage/src/source/kafka.rs`. Reclocking is in `src/storage/src/source/reclock.rs` plus `reclock/compat.rs` (the persist-backed remap handle). UPSERT logic is in `src/storage/src/upsert.rs` (classic) and `src/storage/src/upsert_continual_feedback.rs` / `upsert_continual_feedback_v2.rs` (continual-feedback variants). + +### Source-time vs into-time + +* **Source time** for Kafka is `Partitioned<RangeBound<PartitionId>, MzOffset>` (`mz_storage_types::sources::kafka`). The frontier is a multi-partition antichain. +* **Into time** is Materialize's `mz_repr::Timestamp` (ms since epoch). The mapping from source time → into time is the *remap shard*: a persist shard whose contents accumulate to a well-formed `Antichain<FromTime>` at every into-time. 
See `ReclockOperator` doc comment: "for any time `IntoTime` the remap collection accumulates into an Antichain where each `FromTime` timestamp has frequency `1`." +* On startup the remap operator loads existing bindings, downgrades to the recovered upper, then mints new bindings when `mint()` receives a probe. + +### Partition handling + +* Partition → worker assignment is round-robin by hash: `((source_id + partition_id) % worker_count) == worker_id` (`kafka.rs`). +* New partitions are picked up by the metadata fetcher and routed through reclocking. +* Per-partition offsets are tracked in `last_offsets`. Code-stated invariant: "if we see offset x, we have seen all offsets [0, x-1] that we are ever going to see" (kafka.rs near line 1005). +* Offsets that arrive `<=` `last_offset` are silently dropped (kafka.rs ~1158). This is the path that protects against rdkafka redelivery on reconnect. +* Negative offsets from an otherwise non-errored message cause `panic!` in `construct_source_message` (kafka.rs ~1193). + +### Append-only (NONE envelope) workload shape + +Decoded rows flow directly into `persist_sink` keyed by Materialize timestamp. Each `(partition, offset)` produces exactly one row (plus metadata columns if requested). There is no retraction unless an upstream EvalError occurs in a downstream operator. + +### UPSERT envelope + +`upsert_commands` (render/sources.rs) maps each `DecodeResult` into `(UpsertKey, Option<Result<Row, UpsertError>>, FromTime)`: + +* `UpsertKey` is a 32-byte SHA-256 digest of the key bytes; collisions are treated as impossible (probabilistic). +* `Some(value)` is an insert/update for `key`; `None` is a tombstone (delete). +* Key decode failures produce `UpsertError::KeyDecode`; null keys produce `UpsertError::NullKey`; value decode failures produce `UpsertError::Value`. 
These flow as `Err` values keyed by the (errored) key and can be *retracted* by a subsequent good `(key, value)` for the same key — this is the contract that makes "fix the bad message" recovery possible without dropping the source. + +The upsert operator (`upsert_classic` in `upsert.rs`) consults a state store (`UpsertStateBackend`) for the prior value before emitting updates. Two backends ship: + +* `InMemoryHashMap` — `BTreeMap`. Lost on restart. +* `RocksDB` — persistent, with a merge operator. Bug history shows the merge operator must always return `Some` or RocksDB aborts the process (commit 0d8d740b47). + +State is reconstructed on restart by replaying the persist *feedback* stream (the output of the upsert operator's previous incarnation) up to the resume frontier. The operator passes through a *snapshot* phase that drains all feedback values for keys at or below the resume frontier, then transitions to normal mint-on-input mode. + +Key invariants stated in code: + +* `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541; mirrored in `upsert_continual_feedback*.rs`) — the upsert operator never sees retractions on its input; only inserts/tombstones. +* `panic!("key missing from commands_state")` (upsert.rs:636) — the operator's internal dedup table must always contain a key it is about to emit for; missing key is a structural invariant violation. +* Order-key monotonicity within a key is enforced by `consolidate_snapshot_chunk` / `drain_staged_input`. A regression here previously caused a panic that was "as close to data loss as possible" (commit f177db8286, issue materialize#26655). The fix skips violating updates rather than panicking. +* In continual-feedback v2: `assert!(diff.is_positive())` again (v2:315) plus `unreachable!()` on `(None, None)` from joined prior/new state (v2:483) and an empty-output assertion in tests (v2:957). 
+ +### Reclock invariants and failure modes + +* `compare_and_append` on the remap shard can return `UpperMismatch` if a racing writer (e.g. across restart) has advanced the shard. `ReclockOperator::mint` retries by `sync()`-ing and re-minting (reclock.rs:160-166). +* `panic!("compare_and_append failed: {invalid_use}")` in `reclock/compat.rs:306` catches genuinely invalid persist calls (vs. retryable upper mismatch). +* Reclock's cached `upper` has a known staleness pitfall (commit e3805ad790, issue database-issues#8698) — fixed by always fetching the recent upper for `as_of` calculation. + +### Statistics and progress signals + +`statistics.rs` reports per-source counters that have correctness invariants of their own: + +* `offset_known >= offset_committed` (commit 3e32df1f69 enforces clamping after a regression bug). +* `snapshot_records_known >= snapshot_records_staged`, both decrease to zero (clear) at end of snapshot. + +These are user-visible numbers and form weak but easily-checkable correctness signals from the workload side. 
+ +### Failure-prone areas relevant to Antithesis + +| Area | Risk | Code | +|------|------|------| +| Negative offset from rdkafka | hard panic | kafka.rs:1193 | +| Late offset on reconnect | silent drop (correct behavior, but check via `assert_sometimes!(saw_late_offset)`) | kafka.rs:1158 | +| Topic recreated with fewer offsets | previously panicked on capability downgrade (commit 99ad668af5) | source_reader_pipeline / kafka.rs | +| Upsert key with timestamp regression | previously panicked (commit f177db8286) | upsert.rs:475-487 | +| RocksDB merge returning `None` | SIGABRT (commit 0d8d740b47) | upsert/rocksdb.rs | +| Reclock `compare_and_append` UpperMismatch retry loop | unbounded retry, can block forever under persist outage | reclock.rs:160 | +| Multi-replica `drain_staged_input` double-pass | duplicate retractions (commit 1accbe28b3) | upsert_continual_feedback.rs | +| Persist sink cached upper across concurrent sinks | stale read leads to false errors (commit 505dc96aaa) | render/persist_sink.rs | +| Flag flip mid-append on persist sink | spurious `InvalidBatchBounds` (commit 68e1dfd86d) | render/persist_sink.rs | + +These are the seeds for the Kafka-specific property catalog in Category 7 of `property-catalog.md`. diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py new file mode 100644 index 0000000000000..a9bf2eac600a1 --- /dev/null +++ b/test/antithesis/workload/test/helper_kafka.py @@ -0,0 +1,90 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Thin confluent-kafka producer wrapper for Antithesis drivers. 
+ +Tracks the highest delivered offset per (topic, partition) so drivers can poll +Materialize statistics for catchup. Delivery retries are left to librdkafka's +idempotent producer; the latest delivery error is kept on the tracker. +""" + +from __future__ import annotations + +import logging +import os +import threading +from dataclasses import dataclass, field + +from confluent_kafka import KafkaException, Producer +from confluent_kafka.admin import AdminClient, NewTopic + +LOG = logging.getLogger("antithesis.helper_kafka") + +BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") + + +@dataclass +class DeliveryTracker: + """Records highest delivered offset per (topic, partition) and any error.""" + + max_offset: dict[tuple[str, int], int] = field(default_factory=dict) + last_error: KafkaException | None = None + _lock: threading.Lock = field(default_factory=threading.Lock) + + def callback(self, err, msg): + if err is not None: + with self._lock: + self.last_error = KafkaException(err) + LOG.warning("kafka delivery error: %s", err) + return + key = (msg.topic(), msg.partition()) + with self._lock: + existing = self.max_offset.get(key, -1) + if msg.offset() > existing: + self.max_offset[key] = msg.offset() + + def topic_max_offset(self, topic: str) -> int: + with self._lock: + offsets = [o for (t, _), o in self.max_offset.items() if t == topic] + return max(offsets) if offsets else -1 + + +def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTracker]: + """Construct a Producer with a fresh DeliveryTracker.""" + config: dict[str, object] = { + "bootstrap.servers": BROKER, + "linger.ms": 5, + "enable.idempotence": True, + "acks": "all", + } + if client_id: + config["client.id"] = client_id + return Producer(config), DeliveryTracker() + + +def ensure_topic(topic: str, num_partitions: int = 1) -> None: + """Create the topic if it doesn't already exist. 
No-op on race with auto-create.""" + admin = AdminClient({"bootstrap.servers": BROKER}) + existing = admin.list_topics(timeout=10).topics + if topic in existing: + return + LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions) + futures = admin.create_topics( + [NewTopic(topic, num_partitions=num_partitions, replication_factor=1)] + ) + for t, fut in futures.items(): + try: + fut.result(timeout=30) + except KafkaException as exc: + # TOPIC_ALREADY_EXISTS = 36 + err = exc.args[0] if exc.args else None + if err is not None and getattr(err, "code", lambda: None)() == 36: + LOG.info("kafka topic %s raced with auto-create; continuing", t) + continue + raise diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py new file mode 100644 index 0000000000000..d90babf162baf --- /dev/null +++ b/test/antithesis/workload/test/helper_pg.py @@ -0,0 +1,120 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Resilient Materialize/pgwire connection helpers for Antithesis drivers. + +The workload runs under active fault injection. Every call retries network and +admission errors transparently; everything else propagates. 
+""" + +from __future__ import annotations + +import logging +import os +import time +from collections.abc import Iterator, Sequence +from contextlib import contextmanager +from typing import Any + +import psycopg + +LOG = logging.getLogger("antithesis.helper_pg") + +PGHOST = os.environ.get("PGHOST", "materialized") +PGPORT = int(os.environ.get("PGPORT", "6875")) +PGUSER = os.environ.get("PGUSER", "materialize") +PGDATABASE = os.environ.get("PGDATABASE", "materialize") + +# Retry tuning. Antithesis injects partitions and node hangs; conservative bounds +# keep drivers progressing without masking real correctness signals. +_CONNECT_TIMEOUT_S = 5 +_RETRY_BUDGET_S = 60 +_RETRY_INITIAL_S = 0.1 +_RETRY_MAX_S = 2.0 + + +def _retryable(exc: BaseException) -> bool: + if isinstance(exc, psycopg.OperationalError): + return True + # psycopg wraps server-side admin shutdowns as InterfaceError on next op. + if isinstance(exc, psycopg.InterfaceError): + return True + return False + + +@contextmanager +def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]: + """Yield a connection, retrying transient failures up to RETRY_BUDGET_S.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=_CONNECT_TIMEOUT_S, + autocommit=autocommit, + ) + break + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg connect retrying after %s; backoff=%.2fs", exc, backoff) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + try: + yield conn + finally: + try: + conn.close() + except Exception: # noqa: BLE001 + pass + + +def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None: + """Execute a statement, retrying transient errors. 
No result returned.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with connect() as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg execute retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]: + """Run a query and return all rows, retrying transient errors.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with connect() as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return list(cur.fetchall()) + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg query retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def query_one_retry( + sql: str, params: Sequence[Any] | None = None +) -> tuple[Any, ...] | None: + rows = query_retry(sql, params) + return rows[0] if rows else None diff --git a/test/antithesis/workload/test/helper_quiet.py b/test/antithesis/workload/test/helper_quiet.py new file mode 100644 index 0000000000000..adb4f9ead3e6d --- /dev/null +++ b/test/antithesis/workload/test/helper_quiet.py @@ -0,0 +1,38 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Wrapper around the Antithesis ANTITHESIS_STOP_FAULTS binary. + +Outside Antithesis (e.g. 
snouty local validate), the env var is unset and this +becomes a no-op so the workload still runs end-to-end. +""" + +from __future__ import annotations + +import logging +import os +import subprocess + +LOG = logging.getLogger("antithesis.helper_quiet") + + +def request_quiet_period(seconds: int) -> bool: + """Request that Antithesis pause all faults for `seconds`. + + Returns True if the request was issued, False if not in Antithesis. Either + way callers must still poll for the system to stabilize — the binary + returns immediately and the actual quiet window unfolds asynchronously. + """ + binary = os.environ.get("ANTITHESIS_STOP_FAULTS") + if not binary: + LOG.info("ANTITHESIS_STOP_FAULTS not set; skipping quiet-period request") + return False + LOG.info("requesting %ds quiet period via %s", seconds, binary) + subprocess.run([binary, str(seconds)], check=False) + return True diff --git a/test/antithesis/workload/test/helper_random.py b/test/antithesis/workload/test/helper_random.py new file mode 100644 index 0000000000000..cb749227d6f17 --- /dev/null +++ b/test/antithesis/workload/test/helper_random.py @@ -0,0 +1,64 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Deterministic randomness for Antithesis drivers. + +All driver randomness must go through the Antithesis SDK so timelines replay +deterministically. Outside Antithesis we fall back to the stdlib `random` with a +fixed-but-arbitrary seed per process so local runs are not flaky. 
+""" + +from __future__ import annotations + +import os +import random as _stdlib_random +from collections.abc import Sequence +from typing import TypeVar + +try: + from antithesis import random as _ar + + _ANTITHESIS = True +except ImportError: + _ANTITHESIS = False + +T = TypeVar("T") + +# A stable per-process seed so local snouty validate runs are deterministic +# within one process but pick a different sequence per process invocation. +_FALLBACK = _stdlib_random.Random(int.from_bytes(os.urandom(8), "little")) + + +def random_u64() -> int: + if _ANTITHESIS: + return _ar.get_random() + return _FALLBACK.getrandbits(64) + + +def random_choice(seq: Sequence[T]) -> T: + if not seq: + raise ValueError("random_choice on empty sequence") + if _ANTITHESIS: + return _ar.random_choice(list(seq)) + return _FALLBACK.choice(seq) + + +def random_int(low: int, high: int) -> int: + """Inclusive on both ends.""" + if low > high: + raise ValueError("low > high") + span = high - low + 1 + return low + (random_u64() % span) + + +def random_bool(true_prob: float) -> bool: + if not 0.0 <= true_prob <= 1.0: + raise ValueError("true_prob out of range") + # Use 16 bits of entropy to avoid floating-point quirks under replay. + return (random_u64() & 0xFFFF) < int(true_prob * 0x10000) diff --git a/test/antithesis/workload/test/helper_source_stats.py b/test/antithesis/workload/test/helper_source_stats.py new file mode 100644 index 0000000000000..54af7f0e29866 --- /dev/null +++ b/test/antithesis/workload/test/helper_source_stats.py @@ -0,0 +1,86 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Catchup polling against `mz_internal.mz_source_statistics`. 
+ +Used by drivers to wait until a Kafka source has durably ingested at least +some target offset (typically the maximum produced offset). All durations are +budgeted; callers handle timeouts. +""" + +from __future__ import annotations + +import logging +import time + +from helper_pg import query_one_retry + +LOG = logging.getLogger("antithesis.helper_source_stats") + + +def offset_committed(source_name: str) -> int | None: + """Return the maximum offset_committed for `source_name`, or None. + + `mz_source_statistics.offset_committed` is the durably-ingested upstream + offset, aggregated across replicas in the view. Returns None if the + statistics row does not exist yet (very early in source lifetime) so + callers can distinguish "not initialized" from "still behind." + """ + row = query_one_retry( + """ + SELECT MAX(ss.offset_committed)::bigint + FROM mz_internal.mz_source_statistics ss + JOIN mz_sources s ON s.id = ss.id + WHERE s.name = %s + """, + (source_name,), + ) + if row is None or row[0] is None: + return None + return int(row[0]) + + +def wait_for_catchup( + source_name: str, + target_offset: int, + timeout_s: float = 60.0, + poll_interval_s: float = 0.5, +) -> bool: + """Wait until offset_committed for `source_name` reaches `target_offset`. + + Returns True if catchup completed within `timeout_s`, False on timeout. 
+ """ + deadline = time.monotonic() + timeout_s + last_seen: int | None = None + while time.monotonic() < deadline: + observed = offset_committed(source_name) + if observed is not None and observed >= target_offset: + LOG.info( + "source %s caught up: observed=%d target=%d", + source_name, + observed, + target_offset, + ) + return True + if observed != last_seen: + LOG.info( + "source %s waiting for catchup: observed=%s target=%d", + source_name, + observed, + target_offset, + ) + last_seen = observed + time.sleep(poll_interval_s) + LOG.warning( + "source %s catchup timeout: observed=%s target=%d", + source_name, + last_seen, + target_offset, + ) + return False diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py new file mode 100644 index 0000000000000..59332b28d64e9 --- /dev/null +++ b/test/antithesis/workload/test/helper_upsert_source.py @@ -0,0 +1,54 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis UPSERT-envelope Kafka source. + +Used by all drivers that exercise UPSERT semantics. Topic is auto-created by +the Kafka broker on first produce; the source/connection are created at most +once across all drivers (CREATE ... IF NOT EXISTS). 
+""" + +from __future__ import annotations + +import logging +import os + +from helper_pg import execute_retry + +LOG = logging.getLogger("antithesis.helper_upsert_source") + +KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") + +CONNECTION_NAME = "antithesis_kafka_conn" +TOPIC_UPSERT_TEXT = "antithesis-upsert-text" +SOURCE_UPSERT_TEXT = "upsert_text_src" + + +def ensure_kafka_connection() -> None: + execute_retry( + f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} " + f"TO KAFKA (BROKER '{KAFKA_BROKER}', SECURITY PROTOCOL = 'PLAINTEXT')" + ) + + +def ensure_upsert_text_source() -> None: + """Create the upsert-envelope source over a text key/value Kafka topic. + + The resulting source has columns `key TEXT NOT NULL` and `text TEXT`. + """ + ensure_kafka_connection() + execute_retry( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} " + f"IN CLUSTER {CLUSTER} " + f"FROM KAFKA CONNECTION {CONNECTION_NAME} (TOPIC '{TOPIC_UPSERT_TEXT}') " + f"KEY FORMAT TEXT VALUE FORMAT TEXT " + f"ENVELOPE UPSERT" + ) + LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT) diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py new file mode 100755 index 0000000000000..7aa54acb3192d --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for property `upsert-key-reflects-latest-value`. 
+ +For each key produced to a Kafka UPSERT-envelope source, after a quiet period +that lets Materialize catch up, the source's row for that key must reflect the +last value produced — or be absent if the last message was a tombstone. + +Each invocation: + 1. Ensures the upsert source exists (idempotent CREATE ... IF NOT EXISTS). + 2. Picks a per-invocation key prefix so concurrent driver instances don't + interfere with each other's expected-state model. + 3. Produces a deterministic mix of upserts and tombstones, tracking the + local "what should the source say" model. + 4. Requests an Antithesis quiet period and waits for offset_committed to + reach the highest produced offset. + 5. For every tracked key, asserts that what's in the source matches the + local model. Live keys use one assertion message, tombstoned keys use + another, so triage can distinguish the two failure modes. + +This is a `parallel_driver_` — Antithesis runs many concurrent instances and +each one assigns itself a fresh prefix from deterministic randomness, so +multiple drivers exercise the source without colliding. +""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_kafka import make_producer +from helper_pg import query_one_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup +from helper_upsert_source import ( + SOURCE_UPSERT_TEXT, + TOPIC_UPSERT_TEXT, + ensure_upsert_text_source, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.upsert_latest_value") + +# Knobs. Kept small per-invocation because Antithesis launches the driver many +# times; total coverage comes from re-invocations, not from one huge run. 
+PRODUCES_PER_INVOCATION = 40 +DISTINCT_KEYS = 8 # small key space so we re-write the same key often +DISTINCT_VALUES = 16 +TOMBSTONE_PROB = 0.15 + +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 + + +def _produce(producer, tracker, topic: str, key: str, value: str | None) -> None: + """Encode value=None as a Kafka tombstone (null payload).""" + payload = None if value is None else value.encode("utf-8") + producer.produce( + topic=topic, + key=key.encode("utf-8"), + value=payload, + on_delivery=tracker.callback, + ) + + +def _select_value_for_key(key: str) -> tuple[bool, str | None]: + """Return (found, value) for the single source row matching `key`. + + Returns (False, None) when no row exists (the tombstone case for an + UPSERT source). Returns (True, value) when exactly one row exists. + Raises if more than one row exists — that would mean the source is + multi-rowed per key and violates the UPSERT contract itself, which is + out of scope for this property and should be caught by + `kafka-source-no-data-duplication`. + """ + row = query_one_retry( + f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", + (key,), + ) + if row is None: + return False, None + count, value = row + if count == 0: + return False, None + if count != 1: + raise RuntimeError( + f"upsert source has {count} rows for key {key!r}; this driver assumes " + "the per-key uniqueness property holds" + ) + return True, value + + +def main() -> int: + ensure_upsert_text_source() + + # Per-invocation prefix isolates this driver's keys from other concurrent + # drivers and from previous invocations of this same driver. + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; prefix=%s", prefix) + + producer, tracker = make_producer(client_id=f"antithesis-{prefix}") + + # Local "what should the source say" model for this invocation's keys. + # Value of None means "the last message was a tombstone". 
+ expected: dict[str, str | None] = {} + + keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)] + for _ in range(PRODUCES_PER_INVOCATION): + key = helper_random.random_choice(keys) + if helper_random.random_bool(TOMBSTONE_PROB): + _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None) + expected[key] = None + else: + value = f"v{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}" + _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, value) + expected[key] = value + producer.poll(0) + + # Flush all pending deliveries. We poll callbacks while flushing so the + # tracker reflects the true max produced offset. + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + # Under sustained fault injection we cannot prove which of the just- + # produced messages Kafka actually accepted, so `expected` may name + # values the source never sees. Bail out before running safety + # assertions — fault-induced delivery loss is not what this property + # is testing. The catchup `sometimes()` is also skipped because we + # have no trustworthy target offset. + LOG.info( + "skipping assertions: producer.flush pending=%d last_error=%s", + pending, + tracker.last_error, + ) + return 0 + + max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT) + if max_produced < 0: + LOG.info("no messages confirmed delivered this invocation; exiting cleanly") + return 0 + + # Now ask Antithesis to pause faults and wait for Materialize to catch up. + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + + # Liveness signal: at least one invocation should reach catchup. If this + # never fires across an entire run, the safety assertions below would be + # vacuous and the run is uninteresting. 
+ sometimes( + caught_up, + "upsert: source caught up to produced offsets after quiet period", + {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced}, + ) + + if not caught_up: + # Don't run the per-key safety assertions on stale data — that would + # blame the property for a slow catchup that's a separate concern. + LOG.info("catchup did not complete in budget; skipping per-key assertions") + return 0 + + # Per-key safety assertions. Two distinct messages so triage reports tell + # us *which* invariant broke: a value mismatch or a tombstone resurrection. + for key, want in expected.items(): + found, observed = _select_value_for_key(key) + + if want is None: + # The last produced message for this key was a tombstone; the + # source must not contain a row for it. + always( + not found, + "upsert: tombstoned key has no row in source", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "observed_value": observed, + }, + ) + else: + # Live key: there must be exactly one row, with the latest value. 
+ always( + found and observed == want, + "upsert: SELECT for key matches latest produced value", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + + LOG.info("driver done; asserted on %d keys", len(expected)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7033cce66dd2787d9bce5e9fca0655846f08a9a3 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 17:23:54 -0400 Subject: [PATCH 19/65] src/storage: wrap kafka source + upsert panic sites with antithesis-sdk assertions --- Cargo.lock | 47 ++++++++++ Cargo.toml | 1 + src/storage/Cargo.toml | 1 + src/storage/src/source/kafka.rs | 36 +++++++- src/storage/src/source/reclock/compat.rs | 10 +- src/storage/src/upsert.rs | 15 +++ src/storage/src/upsert/types.rs | 91 +++++++++++++++++-- src/storage/src/upsert_continual_feedback.rs | 11 +++ .../src/upsert_continual_feedback_v2.rs | 15 ++- .../scratchbook/property-catalog.md | 4 + 10 files changed, 221 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86851059fce5e..2f4eed40b37c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -172,6 +172,22 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "antithesis_sdk" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18dbd97a5b6c21cc9176891cf715f7f0c273caf3959897f43b9bd1231939e675" +dependencies = [ + "libc", + "libloading", + "linkme", + "once_cell", + "rand 0.8.5", + "rustc_version_runtime", + "serde", + "serde_json", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -5120,6 +5136,26 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "linkme" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83272d46373fb8decca684579ac3e7c8f3d71d4cc3aa693df8759e260ae41cf" +dependencies = [ + "linkme-impl", +] + +[[package]] +name = "linkme-impl" +version = "0.3.36" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d59e20403c7d08fe62b4376edfe5c7fb2ef1e6b1465379686d0f21c8df444b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -7942,6 +7978,7 @@ dependencies = [ name = "mz-storage" version = "0.0.0" dependencies = [ + "antithesis_sdk", "anyhow", "arrow", "arrow-ipc", @@ -10661,6 +10698,16 @@ dependencies = [ "semver", ] +[[package]] +name = "rustc_version_runtime" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dd18cd2bae1820af0b6ad5e54f4a51d0f3fcc53b05f845675074efcc7af071d" +dependencies = [ + "rustc_version", + "semver", +] + [[package]] name = "rustix" version = "0.38.44" diff --git a/Cargo.toml b/Cargo.toml index 8ba97cb61b290..5d38ff3d8124b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -263,6 +263,7 @@ ahash = { version = "0.8.12", default-features = false } aho-corasick = "1.1.4" allocation-counter = "0" anyhow = "1.0.102" +antithesis_sdk = "0.2.8" array-concat = "0.5.5" arrayvec = "0.7.6" arrow = { version = "57", default-features = false } diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index f96d9991511dc..2e7f4f4a37ab7 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -15,6 +15,7 @@ bench = false [dependencies] anyhow.workspace = true +antithesis_sdk.workspace = true async-stream.workspace = true async-trait.workspace = true aws-credential-types.workspace = true diff --git a/src/storage/src/source/kafka.rs b/src/storage/src/source/kafka.rs index 60ab8b8928058..2f6e8d28f960e 100644 --- a/src/storage/src/source/kafka.rs +++ b/src/storage/src/source/kafka.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use std::thread; use std::time::Duration; +use antithesis_sdk::{assert_always, assert_unreachable}; use anyhow::anyhow; use chrono::{DateTime, NaiveDateTime}; use differential_dataflow::{AsCollection, Hashable}; @@ -52,6 +53,7 @@ use 
rdkafka::statistics::Statistics; use rdkafka::topic_partition_list::Offset; use rdkafka::{ClientContext, Message, TopicPartitionList}; use serde::{Deserialize, Serialize}; +use serde_json::json; use timely::PartialOrder; use timely::container::CapacityContainerBuilder; use timely::dataflow::channels::pact::Pipeline; @@ -273,7 +275,13 @@ fn render_reader<'scope>( .iter() .map(|(_name, kind)| kind.clone()) .collect::>(), - _ => panic!("unexpected source export details: {:?}", details), + _ => { + assert_unreachable!( + "kafka: unexpected source export details", + &json!({"source_id": id.to_string()}) + ); + panic!("unexpected source export details: {:?}", details) + } }; let statistics = config @@ -888,6 +896,11 @@ fn render_reader<'scope>( } } // We can now put them back + assert_always!( + reader.partition_consumers.is_empty(), + "kafka: partition_consumers not drained at shutdown", + &json!({"remaining": reader.partition_consumers.len()}) + ); assert!(reader.partition_consumers.is_empty()); reader.partition_consumers = consumers; @@ -1139,6 +1152,20 @@ impl KafkaSourceReader { // Given the explicit consumer to partition assignment, we should never receive a message // for a partition for which we have no metadata + let partition_known = self + .last_offsets + .get(output_index) + .map(|m| m.contains_key(&partition)) + .unwrap_or(false); + assert_always!( + partition_known, + "kafka: partition missing from last_offsets", + &json!({ + "source_id": self.id.to_string(), + "partition": partition, + "output_index": output_index, + }) + ); assert!( self.last_offsets .get(output_index) @@ -1190,6 +1217,13 @@ fn construct_source_message( ) { let pid = msg.partition(); let Ok(offset) = u64::try_from(msg.offset()) else { + assert_unreachable!( + "kafka: negative offset from non-error message", + &json!({ + "partition": msg.partition(), + "raw_offset": msg.offset(), + }) + ); panic!( "got negative offset ({}) from otherwise non-error'd kafka message", msg.offset() diff --git 
a/src/storage/src/source/reclock/compat.rs b/src/storage/src/source/reclock/compat.rs index a260e2dfcf060..607bbc4c5e680 100644 --- a/src/storage/src/source/reclock/compat.rs +++ b/src/storage/src/source/reclock/compat.rs @@ -15,6 +15,7 @@ use std::rc::Rc; use std::sync::Arc; use std::time::Duration; +use antithesis_sdk::assert_unreachable; use anyhow::Context; use differential_dataflow::lattice::Lattice; use fail::fail_point; @@ -33,6 +34,7 @@ use mz_storage_client::util::remap_handle::{RemapHandle, RemapHandleReader}; use mz_storage_types::StorageDiff; use mz_storage_types::controller::CollectionMetadata; use mz_storage_types::sources::{SourceData, SourceTimestamp}; +use serde_json::json; use timely::order::{PartialOrder, TotalOrder}; use timely::progress::Timestamp; use timely::progress::frontier::Antichain; @@ -303,7 +305,13 @@ where *self.shared_write_frontier.borrow_mut() = new_upper; return result; } - Err(invalid_use) => panic!("compare_and_append failed: {invalid_use}"), + Err(invalid_use) => { + assert_unreachable!( + "reclock: compare_and_append InvalidUsage", + &json!({"error": invalid_use.to_string()}) + ); + panic!("compare_and_append failed: {invalid_use}") + } } } diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs index cdc583d76b119..5c8922de4c022 100644 --- a/src/storage/src/upsert.rs +++ b/src/storage/src/upsert.rs @@ -15,6 +15,7 @@ use std::hash::{Hash, Hasher}; use std::path::PathBuf; use std::sync::Arc; +use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::hashable::Hashable; use differential_dataflow::{AsCollection, VecCollection}; use futures::StreamExt; @@ -34,6 +35,7 @@ use mz_timely_util::builder_async::{ PressOnDropButton, }; use serde::{Deserialize, Serialize}; +use serde_json::json; use sha2::{Digest, Sha256}; use timely::dataflow::channels::pact::Exchange; use timely::dataflow::operators::{Capability, InputCapability, Operator}; @@ -538,6 +540,11 @@ fn stage_input( } 
stash.extend(data.drain(..).map(|((key, value, order), time, diff)| { + assert_always!( + diff.is_positive(), + "upsert: input diff positive (classic)", + &json!({"diff": diff.into_inner()}) + ); assert!(diff.is_positive(), "invalid upsert input"); (time, key, Reverse(order), value) })); @@ -633,6 +640,10 @@ async fn drain_staged_input( let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) { command_state } else { + assert_unreachable!( + "upsert: key missing from commands_state (classic)", + &json!({"source_id": source_config.id.to_string()}) + ); panic!("key missing from commands_state"); }; @@ -1028,5 +1039,9 @@ async fn process_upsert_state_error( let update = HealthStatusUpdate::halting(e.context(context).to_string_with_causes(), None); health_output.give(health_cap, (None, update)); std::future::pending::<()>().await; + assert_unreachable!( + "upsert: pending future returned (classic)", + &json!({"site": "process_upsert_state_error"}) + ); unreachable!("pending future never returns"); } diff --git a/src/storage/src/upsert/types.rs b/src/storage/src/upsert/types.rs index 2bf8270aa2c95..57a4b85033563 100644 --- a/src/storage/src/upsert/types.rs +++ b/src/storage/src/upsert/types.rs @@ -88,11 +88,13 @@ use std::num::Wrapping; use std::sync::Arc; use std::time::Instant; +use antithesis_sdk::{assert_always, assert_unreachable}; use bincode::Options; use itertools::Itertools; use mz_ore::error::ErrorExt; use mz_repr::{Diff, GlobalId}; use serde::{Serialize, de::DeserializeOwned}; +use serde_json::json; use crate::metrics::upsert::{UpsertMetrics, UpsertSharedMetrics}; use crate::statistics::SourceStatistics; @@ -294,6 +296,10 @@ impl StateValue { match self { Self::Value(value) => value, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: into_decoded on Consolidating StateValue", + &json!({"accessor": "into_decoded"}) + ); panic!("called `into_decoded without calling `ensure_decoded`") } } @@ -366,6 +372,10 @@ impl 
StateValue { }), }), StateValue::Consolidating(_) => { + assert_unreachable!( + "upsert: into_provisional_value on Consolidating StateValue", + &json!({"accessor": "into_provisional_value"}) + ); panic!("called `into_provisional_value` without calling `ensure_decoded`") } } @@ -400,6 +410,10 @@ impl StateValue { }), }), StateValue::Consolidating(_) => { + assert_unreachable!( + "upsert: into_provisional_tombstone on Consolidating StateValue", + &json!({"accessor": "into_provisional_tombstone"}) + ); panic!("called `into_provisional_tombstone` without calling `ensure_decoded`") } } @@ -413,6 +427,10 @@ impl StateValue { _ => None, }, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: provisional_order on Consolidating StateValue", + &json!({"accessor": "provisional_order"}) + ); panic!("called `provisional_order` without calling `ensure_decoded`") } } @@ -427,6 +445,10 @@ impl StateValue { _ => value.finalized.as_ref(), }, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: provisional_value_ref on Consolidating StateValue", + &json!({"accessor": "provisional_value_ref"}) + ); panic!("called `provisional_value_ref` without calling `ensure_decoded`") } } @@ -437,6 +459,10 @@ impl StateValue { match self { Self::Value(v) => v.finalized, Self::Consolidating(_) => { + assert_unreachable!( + "upsert: into_finalized_value on Consolidating StateValue", + &json!({"accessor": "into_finalized_value"}) + ); panic!("called `into_finalized_value` without calling `ensure_decoded`") } } @@ -577,7 +603,13 @@ impl StateValue { *acc ^= val; } } - _ => panic!("`merge_update_state` called with non-consolidating state"), + _ => { + assert_unreachable!( + "upsert: merge_update_state on non-Consolidating state", + &json!({"site": "merge_update_state"}) + ); + panic!("`merge_update_state` called with non-consolidating state") + } } } @@ -618,29 +650,61 @@ impl StateValue { }) .expect("invalid upsert state"); // Truncation is fine (using `as`) as this is just a checksum 
+ let want_checksum = seahash::hash(value) as i64; + assert_always!( + consolidating.checksum_sum.0 == want_checksum, + "upsert: consolidating checksum_sum mismatch (diff_sum=1)", + &json!({ + "source_id": source_id.to_string(), + "checksum_sum": consolidating.checksum_sum.0, + "expected_seahash": want_checksum, + }) + ); assert_eq!( - consolidating.checksum_sum.0, - // Hash the value, not the full buffer, which may have extra 0's - seahash::hash(value) as i64, + consolidating.checksum_sum.0, want_checksum, "invalid upsert state: checksum_sum does not match, state: {}, {}", - consolidating, - source_id, + consolidating, source_id, ); *self = Self::finalized_value(bincode_opts.deserialize(value).unwrap()); } 0 => { + assert_always!( + consolidating.len_sum.0 == 0, + "upsert: consolidating len_sum nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "len_sum": consolidating.len_sum.0, + }) + ); assert_eq!( consolidating.len_sum.0, 0, "invalid upsert state: len_sum is non-0, state: {}, {}", consolidating, source_id, ); + assert_always!( + consolidating.checksum_sum.0 == 0, + "upsert: consolidating checksum_sum nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "checksum_sum": consolidating.checksum_sum.0, + }) + ); assert_eq!( consolidating.checksum_sum.0, 0, "invalid upsert state: checksum_sum is non-0, state: {}, {}", consolidating, source_id, ); + let all_zero = consolidating.value_xor.iter().all(|&x| x == 0); + assert_always!( + all_zero, + "upsert: consolidating value_xor nonzero (diff_sum=0)", + &json!({ + "source_id": source_id.to_string(), + "value_xor_len": consolidating.value_xor.len(), + }) + ); assert!( - consolidating.value_xor.iter().all(|&x| x == 0), + all_zero, "invalid upsert state: value_xor not all 0s with 0 diff. 
\ Non-zero positions: {:?}, state: {}, {}", consolidating @@ -669,6 +733,15 @@ impl StateValue { ), Err(_) => "Err(UpsertValueError)".to_string(), }); + assert_unreachable!( + "upsert: consolidating diff_sum not in {0,1}", + &json!({ + "source_id": source_id.to_string(), + "diff_sum": other, + "value_byte_len": value_byte_len, + "decodable": decode_ok, + }) + ); panic!( "invalid upsert state: non 0/1 diff_sum: {}, state: {}, {}, \ key: {:?}, value_byte_len: {:?}, decodable: {:?}", @@ -1059,6 +1132,10 @@ where }); if completed && self.snapshot_completed { + assert_unreachable!( + "upsert: snapshot completion called twice", + &json!({"site": "consolidate_chunk"}) + ); panic!("attempted completion of already completed upsert snapshot") } diff --git a/src/storage/src/upsert_continual_feedback.rs b/src/storage/src/upsert_continual_feedback.rs index a4669d3a80099..5fb562a7aa08a 100644 --- a/src/storage/src/upsert_continual_feedback.rs +++ b/src/storage/src/upsert_continual_feedback.rs @@ -14,6 +14,7 @@ use std::cmp::Reverse; use std::fmt::Debug; use std::sync::Arc; +use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::hashable::Hashable; use differential_dataflow::{AsCollection, VecCollection}; use indexmap::map::Entry; @@ -23,6 +24,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError}; use mz_timely_util::builder_async::{ Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton, }; +use serde_json::json; use std::convert::Infallible; use timely::container::CapacityContainerBuilder; use timely::dataflow::StreamVec; @@ -623,6 +625,11 @@ fn stage_input( } stash.extend(data.drain(..).map(|((key, value, order), time, diff)| { + assert_always!( + diff.is_positive(), + "upsert: input diff positive (cf v1)", + &json!({"diff": diff.into_inner()}) + ); assert!(diff.is_positive(), "invalid upsert input"); (time, key, Reverse(order), value) })); @@ -797,6 +804,10 @@ where let mut command_state = if let 
Entry::Occupied(command_state) = commands_state.entry(key) { command_state } else { + assert_unreachable!( + "upsert: key missing from commands_state (cf v1)", + &json!({"source_id": source_config.id.to_string()}) + ); panic!("key missing from commands_state"); }; diff --git a/src/storage/src/upsert_continual_feedback_v2.rs b/src/storage/src/upsert_continual_feedback_v2.rs index 32de9e3770086..8560ffd614603 100644 --- a/src/storage/src/upsert_continual_feedback_v2.rs +++ b/src/storage/src/upsert_continual_feedback_v2.rs @@ -65,6 +65,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; +use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::difference::{IsZero, Semigroup}; use differential_dataflow::hashable::Hashable; use differential_dataflow::lattice::Lattice; @@ -81,6 +82,7 @@ use mz_storage_types::errors::{DataflowError, EnvelopeError}; use mz_timely_util::builder_async::{ Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, PressOnDropButton, }; +use serde_json::json; use std::convert::Infallible; use timely::container::CapacityContainerBuilder; use timely::dataflow::StreamVec; @@ -312,6 +314,11 @@ where AsyncEvent::Data(cap, data) => { let mut pushed_any = false; for ((key, value, from_time), ts, diff) in data { + assert_always!( + diff.is_positive(), + "upsert: input diff positive (cf v2)", + &json!({"diff": diff.into_inner()}) + ); assert!(diff.is_positive(), "invalid upsert input"); if PartialOrder::less_equal(&input_upper, &resume_upper) && !resume_upper.less_equal(&ts) @@ -480,7 +487,13 @@ where (Some(a), Some(b)) => std::cmp::min(a, b).clone(), (Some(a), None) => a.clone(), (None, Some(b)) => b.clone(), - (None, None) => unreachable!(), + (None, None) => { + assert_unreachable!( + "upsert: cf v2 join produced (None, None)", + &json!({"site": "min_ts join"}) + ); + unreachable!() + } }; cap.downgrade(&min_ts); } else { diff --git a/test/antithesis/scratchbook/property-catalog.md 
b/test/antithesis/scratchbook/property-catalog.md index 0645f1e868414..40b390c85529c 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -326,6 +326,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Reachability (Unreachable) | | **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit | +| **Status** | **Implemented (SUT-side)** — every targeted site in `src/storage/src/upsert.rs` (stash diff-positive, `commands_state` missing key, `process_upsert_state_error` pending-future guard), `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion) gets a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`. Panics still terminate the process; Antithesis now also receives a reportable property failure with rich details. | | **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). | | **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. 
| | **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. | @@ -337,6 +338,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P0 — directly guards upsert state-store data integrity; catches XOR/checksum corruption | +| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. Four `assert_always!` calls inside `ensure_decoded` covering the `diff_sum == 1` checksum match and the three `diff_sum == 0` zero-residue checks, plus one `assert_unreachable!` on the `diff_sum ∉ {0,1}` impossible-state path. Each carries the consolidating state's diagnostic in `details`. | | **Property** | When the upsert state backend's `StateValue::ensure_decoded` finalizes a `Consolidating` cell into either a live `Value` or a `tombstone`, the consolidating accumulator is well-formed: `diff_sum ∈ {0, 1}`; if `diff_sum == 1` the recovered bytes match the recorded `len_sum` and `checksum_sum` (seahash of `value_xor[..len_sum]`); if `diff_sum == 0` then `len_sum == 0`, `checksum_sum == 0`, and every byte of `value_xor` is zero. | | **Invariant** | `Always`: the `panic!("invalid upsert state: non 0/1 diff_sum: …")` at `upsert/types.rs:672` becomes an `assert_always!(false, "upsert: non 0/1 diff_sum")` with a unique message. The intermediate `assert_eq!`s at :621, :632, :637 and the `assert!` at :642 are likewise upgraded to `assert_always!` so they report rather than crash. Each site gets a distinct, specific message. | | **Antithesis Angle** | The consolidating state collapses many `(diff, bytes)` updates per key into running `diff_sum`, `len_sum`, `checksum_sum`, and an XOR-merged `value_xor` blob.
The invariant relies on (a) every retraction being paired with an identical insertion in the snapshot stream, and (b) the snapshot completion contract delivering exactly the durable state at the resume frontier. Antithesis explores: crash mid-snapshot-replay, RocksDB merge operator interleaved with multi_put, partial feedback delivery across restart, and (most subtly) duplicated retractions from multi-replica drain (commit 1accbe28b3). Any of these can break the XOR cancellation and trip a non-{0,1} diff_sum. | @@ -348,6 +350,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Reachability (Unreachable) | | **Priority** | P2 — type-state protocol invariant; high-signal as a replay anchor | +| **Status** | **Implemented (SUT-side)** — `src/storage/src/upsert/types.rs`. Six `assert_unreachable!` calls, one per accessor (`into_decoded`, `into_provisional_value`, `into_provisional_tombstone`, `provisional_order`, `provisional_value_ref`, `into_finalized_value`), each with a distinct message naming the accessor. Original `panic!` preserved after the assertion. | | **Property** | Every accessor on `StateValue` that requires the cell to be in `Value` form is preceded by a call to `ensure_decoded` for that cell. The six accessor panics — `into_decoded` (297), `into_provisional_value` (369), `into_provisional_tombstone` (403), `provisional_order` (416), `provisional_value_ref` (430), `into_finalized_value` (440) — never fire. | | **Invariant** | `Unreachable`: each `panic!("called \`...\` without calling \`ensure_decoded\`")` site is converted to a distinct `assert_unreachable!("upsert: <accessor> on Consolidating")`. Six unique assertion messages, one per accessor, so an Antithesis report distinguishes which contract was violated. These are pure protocol-misuse guards — they cannot fire in valid execution. | | **Antithesis Angle** | These panics are most likely to fire after a code change to the upsert operator (e.g.
a new code path that forgets `ensure_decoded` before reading `provisional_value`). Antithesis exercises every operator branch under fault injection; turning these into reachability assertions gives a cheap regression-detection net for future refactors of `upsert.rs` / `upsert_continual_feedback*.rs`. They are also useful replay anchors — if Antithesis ever does reach them, the bug is reproducible. | @@ -359,6 +362,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Reachability (Unreachable) | | **Priority** | P1 — direct regression target for topic-recreation and offset-handling bugs | +| **Status** | **Implemented (SUT-side, production sites)** — `src/storage/src/source/kafka.rs` covers the four production panic/assert sites (`unexpected source export details`, `partition_consumers not drained at shutdown`, `partition missing from last_offsets`, `negative offset from non-error message`); `src/storage/src/source/reclock/compat.rs` covers `compare_and_append InvalidUsage`. The remaining `expect()` sites on resume-upper / statistics / offset arithmetic are deferred to a follow-up; they would be a wide mechanical conversion to soft assertions rather than reportable properties. | | **Property** | The explicit panics in `kafka.rs` never fire: `panic!("got negative offset (...)")` (kafka.rs:1193); `panic!("unexpected source export details: ...")` (kafka.rs:276); the `assert!(self.last_offsets[output][partition])` (kafka.rs:1142); plus the `expect()` sites on resume-upper / statistics / offset arithmetic. | | **Invariant** | `Unreachable`: each site converted to a unique `assert_unreachable!("kafka: <site>")`. The "negative offset" panic in particular is a known structural-invariant violation that has fired before. | | **Antithesis Angle** | Topic deletion + recreation, partition rebalancing, manual offset reset on the Kafka broker, clock jumps that interact with Kafka's internal offset arithmetic.
Direct regression target for commit 99ad668af5 (capability downgrade on topic recreation). | From 12f2c795344dbe2b581693df3601c9070e89f71f Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 17:40:15 -0400 Subject: [PATCH 20/65] test/antithesis: implement kafka-source-no-data-loss + kafka-source-no-data-duplication --- .../kafka-source-no-data-duplication.md | 9 + .../properties/kafka-source-no-data-loss.md | 15 ++ .../scratchbook/property-catalog.md | 2 + .../workload/test/helper_none_source.py | 53 +++++ .../parallel_driver_kafka_none_envelope.py | 208 ++++++++++++++++++ 5 files changed, 287 insertions(+) create mode 100644 test/antithesis/workload/test/helper_none_source.py create mode 100755 test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md index fba0e8348808f..21780e5d10211 100644 --- a/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-duplication.md @@ -39,6 +39,15 @@ Aggregates over the source double-count. Joins fan out. Downstream MVs become wr The runtime `assert!` in upsert.rs already aborts on negative input diffs — it just doesn't surface as an Antithesis property. Wrapping each callsite with `assert_always!` (per-site unique message) gives Antithesis the signal it needs without changing semantics outside Antithesis (the underlying `assert!` already aborts on violation). 
+## Implementation status + +Implemented 2026-05-11 in two halves: + +- **NONE envelope, workload-side**: `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py` runs `SELECT partition, "offset", COUNT(*) FROM none_text_src WHERE text LIKE prefix:% GROUP BY 1,2 HAVING COUNT(*) > 1` after each catchup and asserts the result is empty via `always("kafka source: no duplicate (partition, offset)", details)`. Up to five offending rows are carried in `details` for triage. +- **UPSERT envelope, SUT-side**: the `assert_always!(diff.is_positive(), ...)` family added by `upsert-no-internal-panic` covers the "duplicate retraction on input" symptom directly inside the operator at the three call sites in `upsert.rs`, `upsert_continual_feedback.rs`, `upsert_continual_feedback_v2.rs`. The workload-side per-key dedup check is part of `upsert-key-reflects-latest-value`. + +Per-payload visibility (the inverse-pair `kafka-source-no-data-loss` check) shares the same driver — both run on the same produce + catchup cycle to maximize signal per invocation. + ## Provenance Surfaced by: Data Integrity, Concurrency, Failure Recovery. Direct regression target for database-issues#9160. diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md index 2a451a32d4312..e999c42b76083 100644 --- a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md @@ -37,6 +37,21 @@ The interesting window is mid-batch crash: a clusterd kill between the persist s None. No `assert_sometimes!` in the source path today (verified against `existing-assertions.md`). To implement: add an `assert_sometimes!` in the persist sink's `append_batches` after a successful append, plus a workload-side `assert_sometimes!` after the quiet-period catch-up check. 
+## Implementation status + +Implemented 2026-05-11 (NONE envelope, workload-side) as `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. The driver shares a flight with `kafka-source-no-data-duplication` because both check the same dataflow: + +| Message | Type | Fires when | +|---------|------|------------| +| `"kafka source caught up to produced offsets after quiet period (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor | +| `"kafka source: every produced payload is visible exactly once"` | `always` | Per produced payload, after catchup; carries `payload`, `present`, `observed_count` in details | + +The UPSERT-envelope arm of this property is covered by `upsert-key-reflects-latest-value`. + +The SUT-side `assert_sometimes!(persist_sink_appended_batch, ...)` anchor in `append_batches` is **deferred** — it would tighten replay anchoring but the workload check above is already specific enough that triage can localize a failure without it. + +New helper: `helper_none_source.py` — idempotent `CREATE SOURCE ... FORMAT TEXT INCLUDE PARTITION, OFFSET ENVELOPE NONE`, reusing the shared `antithesis_kafka_conn` connection from `helper_upsert_source.py`. + ## Provenance Surfaced by: Data Integrity, Failure Recovery, Product Context. diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 40b390c85529c..d5a8ed8925e6e 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -226,6 +226,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Liveness | | **Priority** | P0 — primary user-visible contract; "data is in Kafka but not in Materialize" is the worst possible streaming bug | +| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. 
Per-payload `always("kafka source: every produced payload is visible exactly once", …)` joined to a quiet-period catchup wait. UPSERT-envelope version is covered by `upsert-key-reflects-latest-value`. The SUT-side `assert_sometimes!(persist_sink_appended_batch)` anchor in `append_batches` is deferred. | | **Property** | After producing a message to a Kafka topic, the Materialize source over that topic eventually contains a row corresponding to that message (NONE envelope) or a row reflecting the latest value for that key (UPSERT envelope). | | **Invariant** | `Sometimes(all_produced_records_visible)`: at least once during a run, after a quiet period, the workload observes `COUNT(*) FROM source` >= number of produced records (NONE) or every produced (key, value) pair is reflected in the source state (UPSERT). Liveness, so `Sometimes` on the catch-up event. | | **Antithesis Angle** | Network partitions between Materialize and Kafka, clusterd kills mid-ingestion, persist write retries, and rebalances. The interesting timing is the *crash mid-batch* window: some offsets are in persist, some are not, and the resume frontier determines what we re-read. Antithesis explores whether the re-read covers exactly the missing offsets. | @@ -237,6 +238,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P0 — silent duplication corrupts every aggregate downstream MV | +| **Status** | **Implemented (workload-side, NONE envelope)** — `test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py`. `always("kafka source: no duplicate (partition, offset)", …)` over a `GROUP BY partition, "offset" HAVING COUNT(*) > 1` query scoped to the invocation's prefix; carries up to five offending rows in `details`. 
UPSERT-envelope version is covered indirectly by `upsert-key-reflects-latest-value` (per-key uniqueness assertion) and directly by the SUT-side `assert_always!(diff.is_positive(), …)` of `upsert-no-internal-panic`. | | **Property** | After settling, the NONE-envelope source contains at most one row per `(partition, offset)` tuple; the UPSERT-envelope source contains at most one row per key. | | **Invariant** | `Always`: `SELECT partition, "offset", COUNT(*) FROM source GROUP BY 1,2 HAVING COUNT(*) > 1` returns no rows for NONE; `SELECT key, COUNT(*) FROM source GROUP BY 1 HAVING COUNT(*) > 1` returns no rows for UPSERT. Checked on every assertion firing — must hold on every observation. | | **Antithesis Angle** | Reader crashes between persist-sink batch write and `compare_and_append`; rehydration re-reads offsets we already wrote. The protection lives in `last_offsets` filtering (kafka.rs:1158) but only for the *current* incarnation — across restart, idempotency depends on the persist sink and (for UPSERT) the feedback-driven snapshot. Antithesis explores crash/restart timing across batch boundaries. Direct regression target for upsert double-retraction bug (commit 1accbe28b3, database-issues#9160). | diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py new file mode 100644 index 0000000000000..e9ecb358675c8 --- /dev/null +++ b/test/antithesis/workload/test/helper_none_source.py @@ -0,0 +1,53 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis NONE-envelope (append-only) Kafka source. 
+ +Used by drivers that exercise the append-only contract. The source has columns +`text TEXT, partition INTEGER, offset BIGINT` — `partition` and `offset` are +the Kafka metadata projected via `INCLUDE PARTITION, OFFSET`, which give us +the per-`(partition, offset)` uniqueness check called out in +`kafka-source-no-data-duplication.md`. +""" + +from __future__ import annotations + +import logging +import os + +from helper_pg import execute_retry +from helper_upsert_source import ensure_kafka_connection + +LOG = logging.getLogger("antithesis.helper_none_source") + +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") + +TOPIC_NONE_TEXT = "antithesis-none-text" +SOURCE_NONE_TEXT = "none_text_src" + + +def ensure_none_text_source() -> None: + """Create the append-only source over a text-valued Kafka topic. + + Resulting columns: `text TEXT NOT NULL, partition INTEGER, offset BIGINT`. + Reuses the shared `antithesis_kafka_conn` Kafka connection so multiple + drivers don't proliferate connections. + """ + ensure_kafka_connection() + execute_retry( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} " + f"IN CLUSTER {CLUSTER} " + f"FROM KAFKA CONNECTION antithesis_kafka_conn (TOPIC '{TOPIC_NONE_TEXT}') " + f"FORMAT TEXT " + f"INCLUDE PARTITION, OFFSET " + f"ENVELOPE NONE" + ) + LOG.info( + "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT + ) diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py new file mode 100755 index 0000000000000..9c3c0e2461cbe --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for the inverse-pair NONE-envelope properties: + - `kafka-source-no-data-loss` — every produced (partition, offset) is visible + - `kafka-source-no-data-duplication` — no (partition, offset) appears twice + +The two run on the same dataflow because they are the symmetric failure modes +of the same contract: one says "no row gone missing," the other says "no row +duplicated." Settling once and asserting both halves catches both bugs from +the same produce pass. + +Each invocation: + 1. Ensures the NONE-envelope source exists. + 2. Picks a per-invocation prefix so concurrent driver instances scope to + disjoint payloads. Every produced message has a `<prefix>:` prefix so the + workload can filter the source down to its own rows when asserting. + 3. Produces N distinct payloads, recording the broker-assigned `(partition, + offset)` for each via the delivery callback. + 4. Requests an Antithesis quiet period and waits for `offset_committed` + to reach the highest produced offset. + 5. Runs two `assert_always` checks: + - "kafka source: no duplicate (partition, offset)" — `GROUP BY 1, 2 HAVING COUNT(*) > 1` is empty + - "kafka source: every produced payload is visible exactly once" — + fires per produced payload; payload, presence, and observed count + go into `details` so triage can localize which payloads went missing + or duplicated + 6. Records one `assert_sometimes` liveness anchor confirming the safety + checks ran against settled data. + +This is a `parallel_driver_` — many concurrent instances exercise the source +without colliding because each invocation owns its prefix range. 
+""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_kafka import make_producer +from helper_none_source import ( + SOURCE_NONE_TEXT, + TOPIC_NONE_TEXT, + ensure_none_text_source, +) +from helper_pg import query_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_none_envelope") + +# Knobs. Tuned so each invocation is a small, self-contained unit of work +# — Antithesis launches the driver many times and accumulates coverage +# across invocations, not within one giant batch. +PRODUCES_PER_INVOCATION = 50 +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 + + +def main() -> int: + ensure_none_text_source() + + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; prefix=%s", prefix) + + producer, tracker = make_producer(client_id=f"antithesis-none-{prefix}") + + # The set of payloads we attempted to produce. Each is unique to + # (prefix, index) so we can filter the source on `text LIKE prefix:%` + # and join payloads back to (partition, offset) without tracking them + # at produce time. + expected_payloads: set[str] = set() + for i in range(PRODUCES_PER_INVOCATION): + payload = f"{prefix}:{i:06d}" + producer.produce( + topic=TOPIC_NONE_TEXT, + value=payload.encode("utf-8"), + on_delivery=tracker.callback, + ) + expected_payloads.add(payload) + producer.poll(0) + + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + # Same fail-closed pattern as the upsert driver: under sustained + # fault injection we cannot prove which messages Kafka accepted, so + # the expected set may name payloads the source never saw. Bail + # before running safety assertions. 
+ LOG.info( + "skipping assertions: producer.flush pending=%d last_error=%s", + pending, + tracker.last_error, + ) + return 0 + + max_produced = tracker.topic_max_offset(TOPIC_NONE_TEXT) + if max_produced < 0: + LOG.info("no messages confirmed delivered this invocation; exiting cleanly") + return 0 + + # Each payload is unique to this invocation (prefix:NNNNNN), so the + # source query below joins payloads back to (partition, offset) + # assignments without us needing to track them at produce time. + + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_NONE_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + + sometimes( + caught_up, + "kafka source caught up to produced offsets after quiet period (none envelope)", + {"source": SOURCE_NONE_TEXT, "target_offset": max_produced}, + ) + + if not caught_up: + LOG.info("catchup did not complete in budget; skipping per-payload assertions") + return 0 + + # ----- no-data-duplication ----- + # `GROUP BY partition, "offset" HAVING COUNT(*) > 1` filtered to this + # invocation's payloads. The catalog's `kafka-source-no-data-duplication` + # property names this exact query shape. + dup_rows = query_retry( + f""" + SELECT partition, "offset", COUNT(*)::bigint + FROM {SOURCE_NONE_TEXT} + WHERE text LIKE %s + GROUP BY 1, 2 + HAVING COUNT(*) > 1 + """, + (f"{prefix}:%",), + ) + always( + len(dup_rows) == 0, + "kafka source: no duplicate (partition, offset)", + { + "source": SOURCE_NONE_TEXT, + "prefix": prefix, + "dupe_count": len(dup_rows), + # Carry up to a handful of offending rows for triage. + "examples": [ + {"partition": int(p), "offset": int(o), "count": int(c)} + for (p, o, c) in dup_rows[:5] + ], + }, + ) + + # ----- no-data-loss ----- + # Confirm every payload we produced is visible *exactly once*. We do this + # via a left-join: enumerate produced payloads, ask the source for each. + # An always-pass requires every produced payload to map to exactly one + # source row whose `text` matches. 
+ # + # We batch all payloads into one query rather than one round-trip per + # payload, so the assertion fires once per payload but the SQL cost + # stays bounded. + rows = query_retry( + f""" + SELECT text, partition, "offset", COUNT(*)::bigint + FROM {SOURCE_NONE_TEXT} + WHERE text LIKE %s + GROUP BY 1, 2, 3 + """, + (f"{prefix}:%",), + ) + by_payload: dict[str, tuple[int, int, int]] = {} + for text, partition, offset, count in rows: + by_payload[text] = (int(partition), int(offset), int(count)) + + for payload in expected_payloads: + info = by_payload.get(payload) + present = info is not None + count = info[2] if info else 0 + always( + present and count == 1, + "kafka source: every produced payload is visible exactly once", + { + "source": SOURCE_NONE_TEXT, + "prefix": prefix, + "payload": payload, + "present": present, + "observed_count": count, + }, + ) + + LOG.info( + "driver done; asserted no-dupe + per-payload visibility on %d produced payloads", + len(expected_payloads), + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From fd6722e7c5385372f70f88276960774e15e7fce1 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 17:48:51 -0400 Subject: [PATCH 21/65] test/antithesis: implement frontier-monotonic, tombstone-removes-key, state-rehydrates-correctly --- .../kafka-source-frontier-monotonic.md | 14 + .../upsert-state-rehydrates-correctly.md | 17 ++ .../upsert-tombstone-removes-key.md | 9 + .../scratchbook/property-catalog.md | 3 + .../test/anytime_kafka_frontier_monotonic.py | 136 ++++++++++ .../parallel_driver_upsert_latest_value.py | 23 ++ ...ngleton_driver_upsert_state_rehydration.py | 248 ++++++++++++++++++ 7 files changed, 450 insertions(+) create mode 100755 test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py create mode 100755 test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py diff --git a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md 
b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md index 03f551e5cbd9f..b22aa8d0e6852 100644 --- a/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md +++ b/test/antithesis/scratchbook/properties/kafka-source-frontier-monotonic.md @@ -35,6 +35,20 @@ A: The retry loop does protect — but only if `sync()` is called *before* the l None. The persist-side `panic!("compare_and_append failed: …")` in `reclock/compat.rs:306` is informational, not a property. Wrap with `assert_unreachable!` for the genuinely-invalid case and add an `assert_always!` for the workload-observable monotonicity. +## Implementation status + +Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. The `anytime_` driver runs throughout the timeline alongside other drivers while faults are active. Each poll iteration: + +1. Lists every Kafka source currently owned by `antithesis_cluster` (a query over `mz_sources` joined to `mz_clusters`, filtered to `type = 'kafka'`), so sources added by new drivers are covered automatically and an early-timeline poll before any source exists asserts nothing rather than firing false positives. +2. For each source, calls `helper_source_stats.offset_committed()` (a `MAX(offset_committed)` over `mz_internal.mz_source_statistics` joined to `mz_sources` by name). +3. Compares against the previous observation for that source in `last_seen`. The assertion `always("kafka: source offset_committed non-monotonic", details)` fires only when both observations succeeded — partition/clusterd unavailable is expected under faults and not an assertion target. + +`details` carries `source`, `previous`, `observed`, and `regression` (`previous - observed`). + +The SUT-side `assert_always!` in `append_batches` and the `reclock/compat.rs` `compare_and_append` paths (commit `e3805ad790`'s and `505dc96aaa`'s code paths) are deferred — the workload signal is sufficient to catch any externally-visible regression. Add SUT instrumentation later if Antithesis surfaces failures that need internal localization. 
+ +The complementary `offset-known-not-below-committed` property is similar shape and could be added to this same driver with minimal cost; that's deliberately deferred to keep this commit scoped to the user-requested three properties. + ## Provenance Surfaced by: Data Integrity, Distributed Coordination. Direct regression target for commits `e3805ad790` and `505dc96aaa`. diff --git a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md index 336deb408759b..287d967d02c47 100644 --- a/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md +++ b/test/antithesis/scratchbook/properties/upsert-state-rehydrates-correctly.md @@ -41,6 +41,23 @@ Compounded by RocksDB merge operator behavior (commit `0d8d740b47`): if the merg None. Candidate SUT anchors: an `assert_sometimes!(upsert_snapshot_completed, "upsert: snapshot phase completed")` at the snapshot-completion call site, and `assert_always!(diff_sum_in_range, …)` mirroring the existing `panic!` in `ensure_decoded`. +## Implementation status + +Implemented 2026-05-11 (workload-side) as `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. The `singleton_driver_` runs exactly once per timeline and lives across multiple produce/settle/assert cycles, holding `expected_state` in process memory across cycles: + +| Message | Type | Fires when | +|---------|------|------------| +| `"upsert: rehydrated state matches local model (live key)"` | `always` | Per live key, per cycle, after catchup. Cross-cycle stability of `expected` is the rehydration check. | +| `"upsert: rehydrated state matches local model (tombstoned key)"` | `always` | Per tombstoned key, per cycle, after catchup. | +| `"upsert: rehydration driver ran 2+ assertion cycles"` | `sometimes` | Once per invocation; confirms the safety check ran against multiple settle cycles (not just one early cycle that masks rehydration). 
| +| `"upsert: rehydration driver observed clusterd replica non-online"` | `sometimes` | Best-effort proxy: `mz_internal.mz_cluster_replica_statuses` showed an `antithesis_cluster` replica in a non-`online` status during the run. Not a guarantee that a restart happened, but a noisy yes-signal that something disturbed the cluster. | + +Knobs: `CYCLE_COUNT=8`, `PRODUCES_PER_CYCLE=30`, `DISTINCT_KEYS=6` (small enough that keys are revisited within and across cycles), `TOMBSTONE_PROB=0.20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=120`, `INTER_CYCLE_SLEEP_S=2`. + +**Requires node-termination faults enabled** in the Antithesis tenant for the property to be exercised at full strength. Without restarts, the cross-cycle stability check still catches divergence from the operator processing a sequence of upserts/tombstones (i.e., it falls back to a slower version of `upsert-key-reflects-latest-value`). + +SUT-side anchors at the upsert snapshot-completion call sites are deferred and would tighten replay anchoring. + ## Provenance Surfaced by: Failure Recovery, Data Integrity. diff --git a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md index 74f5f13a7ba49..50ee185c746f1 100644 --- a/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md +++ b/test/antithesis/scratchbook/properties/upsert-tombstone-removes-key.md @@ -33,6 +33,15 @@ A deleted row reappears after restart. Compliance and correctness hazard. The li None. Workload-side check. The `StateValue::tombstone` construction path and the `ensure_decoded` tombstone branch are the relevant code; adding `assert_sometimes!(tombstone_emitted, ...)` inside the tombstone-emit path gives a coverage signal. 
+## Implementation status + +Implemented 2026-05-11 (workload-side) inside the existing `parallel_driver_upsert_latest_value.py`: + +- Safety half: `always("upsert: tombstoned key has no row in source", ...)` (already existed for `upsert-key-reflects-latest-value`) — fires per key whose latest produced message was a tombstone. +- Path-exercise anchor: new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)`. The driver counts `tombstoned_after_value` — the number of tombstone produces where the immediately-prior produced value for that key was a live value. Without this anchor, the `always` could be vacuously satisfied by tombstones against never-written keys. + +The "no resurrection across restart" half is covered structurally by `upsert-state-rehydrates-correctly`'s cross-cycle stability check, which includes tombstoned keys in its per-key assertion loop (`"upsert: rehydrated state matches local model (tombstoned key)"`). + ## Provenance Surfaced by: Data Integrity, Lifecycle Transitions (delete operations). diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index d5a8ed8925e6e..9e94cdf8ed089 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -250,6 +250,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P1 — frontier regression panics downstream operators and breaks `AS OF` queries | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py`. Continuous `anytime_` driver polls `mz_internal.mz_source_statistics.offset_committed` for every known Kafka source every 500ms and asserts `always("kafka: source offset_committed non-monotonic", details)` whenever a new sample is less than the previous one. Faults are active throughout. 
SUT-side `assert_always!(new_upper >= prev_upper, ...)` in `append_batches` is deferred. | | **Property** | The `upper` frontier of the source's data persist shard never regresses across the lifetime of the source, including across clusterd restarts and `compare_and_append` retries. | | **Invariant** | `Always`: observed `upper(t2) >= upper(t1)` for any observation order `t1 < t2`. Checked on every observation in a workload polling loop, and ideally also as a SUT-side `assert_always!` next to the persist sink's `compare_and_append`. | | **Antithesis Angle** | Kill clusterd mid-`compare_and_append`; resume the source with a stale cached upper; concurrent reclock and persist-sink writers. Direct regression target for the `as_of`/reclock-upper race (commit e3805ad790, database-issues#8698) and the persist-sink cached upper bug (commit 505dc96aaa). | @@ -295,6 +296,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P1 — delete semantics are routinely relied on for GDPR/correctness | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. The existing `always("upsert: tombstoned key has no row in source", ...)` covers the safety half; a new `sometimes("upsert: tombstone overwrote a live value at least once this invocation", ...)` confirms the *interesting* tombstone path (tombstone replacing a live value) is exercised rather than the trivial "tombstone a never-written key" case. | | **Property** | After producing a `(key, null)` tombstone message to the Kafka topic, the UPSERT source eventually contains no row for that key, and the row stays absent until a new non-null value is produced. | | **Invariant** | `Always`: at any settled observation after the tombstone has been ingested (resume_upper > tombstone offset), `SELECT * FROM source WHERE key = ?` returns 0 rows. 
The "no resurrection" half is also `Always`: a key that has been tombstoned and not re-inserted must not reappear after a clusterd restart or rehydration cycle. | | **Antithesis Angle** | Race the tombstone against a state-store snapshot completion. Crash clusterd between persist sink writing the retraction and the upsert state recording the tombstone. The `StateValue::Value` -> tombstone path in `upsert/types.rs` is the relevant code; bugs here look like resurrected rows. | @@ -306,6 +308,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P1 — incorrect rehydration produces wrong-but-plausible-looking output | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py`. Long-running `singleton_driver_` runs N produce→settle→assert cycles holding `expected_state` in process memory. Cross-cycle stability is the rehydration check: if a clusterd restart lands between cycles, the next cycle's `always("upsert: rehydrated state matches local model (live key|tombstoned key)", ...)` verifies the rebuilt source matches the pre-restart model. Requires node-termination faults enabled. | | **Property** | After a clusterd restart, the rehydrated upsert state, as observed via `SELECT * FROM source`, equals the state at the most recent durable timestamp before the restart, for every key produced so far. | | **Invariant** | `Always`: after a kill+restart quiet period, the workload's local key/value model matches the source's contents for every key whose latest message has `offset <= resume_upper`. Combines with `kafka-source-no-data-duplication` (no double inserts on rehydration) and `upsert-key-reflects-latest-value` (correct value per key). | | **Antithesis Angle** | The interesting window is between `compare_and_append` of the persist sink and the upsert operator's feedback-driven snapshot completion. 
If the feedback replay deduplication is wrong, rehydrated state diverges from durable state. Direct regression target for the upsert snapshot-completion logic in `upsert/types.rs` and `upsert_continual_feedback*`. | diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py new file mode 100755 index 0000000000000..faee0fd0c680e --- /dev/null +++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `kafka-source-frontier-monotonic`. + +The `upper` of a Kafka source's persist data shard must never regress across +its lifetime, including across clusterd restarts and `compare_and_append` +retries. Approximated via the workload-visible `offset_committed` reported +in `mz_internal.mz_source_statistics`, which is the durably-ingested +upstream offset for the source. + +This is an `anytime_` driver — it runs continuously throughout the timeline, +polling all of this workload's Kafka sources and asserting that each one's +`offset_committed` never decreases between successive observations. Faults +are active while it runs, which is the right shape for a continuous safety +invariant: Antithesis can crash clusterd between two of our polls and the +next poll must still report a value >= the previous one. + +The driver exits after a bounded budget so Antithesis can re-launch it +freely without one instance pinning resources. 
Cross-invocation: each +instance reads the state from before-restart only via `offset_committed` +itself (no in-process memory carries across) — `last_seen` is reset on each +launch, but Antithesis runs many instances in parallel and the union of +their observations covers the regression window. + +Errors during polling (network partitions, clusterd unavailable) are +*expected* under fault injection and must not produce false-positive +failures. We only assert when we have two successive successful reads for +the same source. +""" + +from __future__ import annotations + +import logging +import sys +import time + +from helper_pg import query_retry +from helper_source_stats import offset_committed + +from antithesis.assertions import always + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_frontier_monotonic") + +# Knobs. +POLL_INTERVAL_S = 0.5 +RUN_BUDGET_S = 30.0 + +# The Antithesis cluster every driver in this workload provisions sources into. +# Discovering sources dynamically (rather than hardcoding names) means new +# drivers that introduce new Kafka sources get monotonicity coverage for free. +ANTITHESIS_CLUSTER = "antithesis_cluster" + + +def _sources_present() -> list[str]: + """Return every Kafka source currently owned by `antithesis_cluster`.""" + rows = query_retry( + """ + SELECT s.name + FROM mz_sources s + JOIN mz_clusters c ON c.id = s.cluster_id + WHERE c.name = %s AND s.type = 'kafka' + """, + (ANTITHESIS_CLUSTER,), + ) + return [r[0] for r in rows] + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + # Per-source most recent committed offset observed across this invocation's + # polls. Each successful new read for a source must be >= last_seen.
+ last_seen: dict[str, int] = {} + polled = 0 + + while time.monotonic() < deadline: + try: + sources = _sources_present() + except Exception as exc: # noqa: BLE001 + LOG.info("source list query failed: %s; sleeping and retrying", exc) + time.sleep(POLL_INTERVAL_S) + continue + + for source in sources: + try: + observed = offset_committed(source) + except Exception as exc: # noqa: BLE001 + LOG.info("offset_committed query failed for %s: %s", source, exc) + continue + if observed is None: + # Statistics row not initialized yet (very early in source + # lifetime, or post-restart before stats first reported). + # Not an assertion target. + continue + + prev = last_seen.get(source) + if prev is not None: + always( + observed >= prev, + "kafka: source offset_committed non-monotonic", + { + "source": source, + "previous": prev, + "observed": observed, + "regression": prev - observed, + }, + ) + + # Always update last_seen, even on regression — we want to keep + # asserting against the most recent observation so a regression + # surfaces once per discrete drop, not on every subsequent poll. + last_seen[source] = observed + polled += 1 + + time.sleep(POLL_INTERVAL_S) + + LOG.info( + "frontier monotonic check done; %d samples across %d sources", + polled, + len(last_seen), + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index 7aa54acb3192d..066620aaf6ded 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -117,10 +117,19 @@ def main() -> int: # Value of None means "the last message was a tombstone". expected: dict[str, str | None] = {} + # Count of times we tombstoned a key whose immediately-prior produced + # value was a live value (not absent, not already tombstoned). 
This is + # the exact `upsert-tombstone-removes-key` exercise pattern: the + # interesting case is "remove a row that was just there," not "tombstone + # a key we never wrote to." + tombstoned_after_value = 0 + keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)] for _ in range(PRODUCES_PER_INVOCATION): key = helper_random.random_choice(keys) if helper_random.random_bool(TOMBSTONE_PROB): + if expected.get(key) is not None: + tombstoned_after_value += 1 _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None) expected[key] = None else: @@ -203,6 +212,20 @@ def main() -> int: }, ) + # Liveness anchor for `upsert-tombstone-removes-key`: confirms the + # interesting tombstone path (tombstone replacing a live value) was + # exercised at least once during the run. Without this, the + # `always(not found, "upsert: tombstoned key has no row in source", ...)` + # check above might fire only against keys that were never live. + sometimes( + tombstoned_after_value > 0, + "upsert: tombstone overwrote a live value at least once this invocation", + { + "tombstoned_after_value": tombstoned_after_value, + "produces": PRODUCES_PER_INVOCATION, + }, + ) + LOG.info("driver done; asserted on %d keys", len(expected)) return 0 diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py new file mode 100755 index 0000000000000..5c41c406f3210 --- /dev/null +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +"""Antithesis driver for `upsert-state-rehydrates-correctly`. + +After a clusterd restart, the rehydrated upsert state — observed via +`SELECT * FROM source` — must equal the state at the most recent durable +timestamp before the restart, for every key produced so far. + +Implementation strategy: a `singleton_driver_` runs exactly once per +timeline and lives long enough to span multiple produce/settle/assert +cycles. Local memory holds the authoritative "what the source should say" +model across cycles. If Antithesis kills clusterd between two cycles, the +next cycle's `SELECT` is effectively a rehydration check — and because the +local model is unchanged across the restart, any divergence in the source +output is exactly the property's failure mode. + +Each cycle: + 1. Produce a batch of (key, value) and (key, null) messages, updating the + in-memory `expected` model. + 2. Request a quiet period and wait for `offset_committed` to reach the + highest produced offset. + 3. SELECT every tracked key's current source state and assert it matches + `expected` via `always("upsert: rehydrated state matches local model + (live key|tombstoned key)", ...)`. Cross-cycle stability is exactly what + rehydration correctness is. + +The driver also records one `sometimes` anchor confirming that at least +two assertion-bearing cycles ran (without this, the safety check could be +vacuously satisfied by a single early settle), and a second anchor +confirming a replica was observed non-`online` after a cycle (best-effort +proxy for "restart happened": a point-in-time read of +`mz_internal.mz_cluster_replica_statuses`, so it can miss short outages). + +Keys use the fixed `reh-k<i>` naming; no per-timeline prefix is generated.
+""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_random +from helper_kafka import make_producer +from helper_pg import query_one_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup +from helper_upsert_source import ( + SOURCE_UPSERT_TEXT, + TOPIC_UPSERT_TEXT, + ensure_upsert_text_source, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.upsert_state_rehydration") + +# Long-running knobs — this driver owns its timeline alongside parallel +# drivers, so the per-cycle budget is generous and the cycle count high +# enough that a node-termination fault has a chance to land between cycles. +CYCLE_COUNT = 8 +PRODUCES_PER_CYCLE = 30 +DISTINCT_KEYS = 6 +DISTINCT_VALUES = 12 +TOMBSTONE_PROB = 0.20 + +QUIET_PERIOD_S = 25 +CATCHUP_TIMEOUT_S = 120.0 +INTER_CYCLE_SLEEP_S = 2.0 + + +def _select_value_for_key(key: str) -> tuple[bool, str | None]: + """Duplicate of `_select_value_for_key` in `parallel_driver_upsert_latest_value.py`. + Kept inline to avoid expanding helper surface for one shared private function.""" + row = query_one_retry( + f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", + (key,), + ) + if row is None: + return False, None + count, value = row + if count == 0: + return False, None + if count != 1: + raise RuntimeError( + f"upsert source has {count} rows for key {key!r}; this driver " + "assumes the per-key uniqueness property holds (see " + "`upsert-key-reflects-latest-value` and " + "`kafka-source-no-data-duplication`)" + ) + return True, value + + +def _saw_clusterd_unavailable() -> bool: + """Best-effort probe: does `mz_internal.mz_cluster_replica_statuses` show + any `antithesis_cluster` replica with `status != 'online'` right now? + The status column reports `online` or `offline`. 
Catching `offline` + in a snapshot doesn't *prove* a restart happened (we may have missed + a transient flap entirely), but it's a noisy yes-signal that something + disturbed the cluster during the cycle. + """ + try: + row = query_one_retry(""" + SELECT EXISTS ( + SELECT 1 + FROM mz_internal.mz_cluster_replica_statuses s + JOIN mz_cluster_replicas r ON r.id = s.replica_id + JOIN mz_clusters c ON c.id = r.cluster_id + WHERE c.name = 'antithesis_cluster' AND s.status != 'online' + ) + """) + except Exception: # noqa: BLE001 + return False + return bool(row and row[0]) + + +def _run_cycle( + producer, tracker, expected: dict[str, str | None], cycle_idx: int +) -> bool: + """Produce one batch, settle, and assert state for every tracked key. + + Returns True if assertions ran (cycle settled), False if we bailed early. + """ + keys = [f"reh-k{i}" for i in range(DISTINCT_KEYS)] + for _ in range(PRODUCES_PER_CYCLE): + key = helper_random.random_choice(keys) + if helper_random.random_bool(TOMBSTONE_PROB): + producer.produce( + topic=TOPIC_UPSERT_TEXT, + key=key.encode("utf-8"), + value=None, + on_delivery=tracker.callback, + ) + expected[key] = None + else: + value = f"reh-v{cycle_idx:02d}-{helper_random.random_int(0, DISTINCT_VALUES - 1):04d}" + producer.produce( + topic=TOPIC_UPSERT_TEXT, + key=key.encode("utf-8"), + value=value.encode("utf-8"), + on_delivery=tracker.callback, + ) + expected[key] = value + producer.poll(0) + + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + LOG.info( + "cycle %d: skipping assertions; flush pending=%d last_error=%s", + cycle_idx, + pending, + tracker.last_error, + ) + return False + + max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT) + if max_produced < 0: + LOG.info("cycle %d: no messages confirmed delivered; skipping", cycle_idx) + return False + + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + 
if not caught_up: + LOG.info( + "cycle %d: catchup did not complete in budget; skipping asserts", cycle_idx + ) + return False + + # Per-key assertion. The cross-cycle stability of `expected` is what + # makes this a rehydration check: if a clusterd restart happened + # between this cycle and the previous, the source has been rebuilt + # from feedback and must agree with `expected` again. + for key, want in expected.items(): + found, observed = _select_value_for_key(key) + if want is None: + always( + not found, + "upsert: rehydrated state matches local model (tombstoned key)", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "cycle": cycle_idx, + "observed_value": observed, + }, + ) + else: + always( + found and observed == want, + "upsert: rehydrated state matches local model (live key)", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "cycle": cycle_idx, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + return True + + +def main() -> int: + ensure_upsert_text_source() + LOG.info("rehydration driver starting; %d cycles planned", CYCLE_COUNT) + + producer, tracker = make_producer(client_id="antithesis-rehydration") + expected: dict[str, str | None] = {} + + cycles_run = 0 + saw_replica_unavailable = False + + for cycle_idx in range(CYCLE_COUNT): + if _run_cycle(producer, tracker, expected, cycle_idx): + cycles_run += 1 + if _saw_clusterd_unavailable(): + saw_replica_unavailable = True + time.sleep(INTER_CYCLE_SLEEP_S) + + sometimes( + cycles_run >= 2, + "upsert: rehydration driver ran 2+ assertion cycles", + {"cycles_run": cycles_run, "cycles_planned": CYCLE_COUNT}, + ) + sometimes( + saw_replica_unavailable, + "upsert: rehydration driver observed clusterd replica non-online", + {"cycles_run": cycles_run}, + ) + + LOG.info("rehydration driver done; %d/%d cycles ran", cycles_run, CYCLE_COUNT) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From bb02873d824a0f83d5878186f97ec18fde7ccb1b Mon Sep 17 
00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 18:24:56 -0400 Subject: [PATCH 22/65] ci: scope CI_ANTITHESIS build to materialized + antithesis-{workload,config} + transitive deps --- ci/test/build.py | 20 +++++++++++++++++++- test/antithesis/push-antithesis.py | 3 +++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/ci/test/build.py b/ci/test/build.py index 5b18ce91e9b31..95f4227afbaa7 100755 --- a/ci/test/build.py +++ b/ci/test/build.py @@ -47,7 +47,25 @@ def main() -> None: # Build and push any images that are not already available on Docker Hub, # so they are accessible to other build agents. print("--- Acquiring mzbuild images") - deps = repo.resolve_dependencies(image for image in repo if image.publish) + if antithesis: + # Antithesis only consumes these three images; everything else in + # the repo (balancerd, sqllogictest, testdrive, ...) is wasted CI + # time for this pipeline. resolve_dependencies walks depends_on + # transitively, so anything materialized actually needs still + # comes along. Keep this list in sync with ANTITHESIS_IMAGES in + # test/antithesis/push-antithesis.py. + antithesis_images = [ + "materialized", + "antithesis-workload", + "antithesis-config", + ] + deps = repo.resolve_dependencies( + repo.images[name] for name in antithesis_images + ) + else: + deps = repo.resolve_dependencies( + image for image in repo if image.publish + ) deps.ensure(pre_build=lambda images: upload_debuginfo(repo, images)) set_build_status("success") annotate_buildkite_with_tags(repo.rd.arch, deps) diff --git a/test/antithesis/push-antithesis.py b/test/antithesis/push-antithesis.py index 2787f5cee8f30..fe1dc7555ea74 100755 --- a/test/antithesis/push-antithesis.py +++ b/test/antithesis/push-antithesis.py @@ -37,6 +37,9 @@ # Images Antithesis needs to be able to pull: # - antithesis-config holds the docker-compose.yaml + .env Antithesis runs. # - materialized + antithesis-workload are referenced by that compose. 
+# Keep this list in sync with the `antithesis_images` branch in +# ci/test/build.py — that's where CI_ANTITHESIS scopes the mzbuild walk so +# the nightly doesn't waste time building images Antithesis never consumes. ANTITHESIS_IMAGES = ["materialized", "antithesis-workload", "antithesis-config"] From 0a1fa97d3510001d54134c98ad953e070097282e Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 20:48:21 -0400 Subject: [PATCH 23/65] test/antithesis: pre-create kafka topics before CREATE SOURCE --- test/antithesis/workload/test/helper_none_source.py | 6 ++++++ test/antithesis/workload/test/helper_upsert_source.py | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py index e9ecb358675c8..a3cb3c1704be1 100644 --- a/test/antithesis/workload/test/helper_none_source.py +++ b/test/antithesis/workload/test/helper_none_source.py @@ -21,6 +21,7 @@ import logging import os +from helper_kafka import ensure_topic from helper_pg import execute_retry from helper_upsert_source import ensure_kafka_connection @@ -40,6 +41,11 @@ def ensure_none_text_source() -> None: drivers don't proliferate connections. """ ensure_kafka_connection() + # CREATE SOURCE issues a Kafka metadata fetch that fails fast if the topic + # is missing; broker auto-create only fires on a producer write, which + # comes later in the driver. Pre-create via admin client so the metadata + # fetch succeeds on the first run. 
+ ensure_topic(TOPIC_NONE_TEXT) execute_retry( f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} " f"IN CLUSTER {CLUSTER} " diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py index 59332b28d64e9..e5c8ac1cc6e6a 100644 --- a/test/antithesis/workload/test/helper_upsert_source.py +++ b/test/antithesis/workload/test/helper_upsert_source.py @@ -9,9 +9,11 @@ """Idempotent setup for the Antithesis UPSERT-envelope Kafka source. -Used by all drivers that exercise UPSERT semantics. Topic is auto-created by -the Kafka broker on first produce; the source/connection are created at most -once across all drivers (CREATE ... IF NOT EXISTS). +Used by all drivers that exercise UPSERT semantics. The topic is pre-created +via the Kafka admin client (broker auto-create only triggers on producer +write, but CREATE SOURCE does a metadata fetch that fails fast otherwise). +The source/connection are created at most once across all drivers +(CREATE ... IF NOT EXISTS). """ from __future__ import annotations @@ -19,6 +21,7 @@ import logging import os +from helper_kafka import ensure_topic from helper_pg import execute_retry LOG = logging.getLogger("antithesis.helper_upsert_source") @@ -44,6 +47,7 @@ def ensure_upsert_text_source() -> None: The resulting source has columns `key TEXT NOT NULL` and `text TEXT`. 
""" ensure_kafka_connection() + ensure_topic(TOPIC_UPSERT_TEXT) execute_retry( f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} " f"IN CLUSTER {CLUSTER} " From 624149c959c866bd8a0ec49e06dac293047d732a Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 11 May 2026 22:10:44 -0400 Subject: [PATCH 24/65] test/antithesis: tolerate orphan _progress collision + add upsert-v2 first_ selector --- .../first_select_upsert_implementation.py | 61 +++++++++++++++++++ .../workload/test/helper_none_source.py | 7 ++- test/antithesis/workload/test/helper_pg.py | 59 ++++++++++++++++++ .../workload/test/helper_upsert_source.py | 7 ++- 4 files changed, 128 insertions(+), 6 deletions(-) create mode 100755 test/antithesis/workload/test/first_select_upsert_implementation.py diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py new file mode 100755 index 0000000000000..03394a1ebd7f7 --- /dev/null +++ b/test/antithesis/workload/test/first_select_upsert_implementation.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis test command: pick v1 or v2 of the upsert continual feedback +operator at the start of each timeline. + +The selection is made via `helper_random.random_u64()` (routes through the +Antithesis SDK for deterministic replay) and applied via `ALTER SYSTEM SET +enable_upsert_v2 = ...` against the `mz_system` internal port. 
Because this +script is a `first_*` Test Composer action it runs after `setup-complete` +but before any `parallel_driver_*` / `singleton_driver_*` creates a source, +so every source rendered in this timeline reads the chosen value. + +Each branch records a `sometimes` assertion so Antithesis surfaces "v1 +covered" and "v2 covered" as separate dashboard signals — if either ever +goes 0/N across the run, we've lost that arm of coverage. +""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from helper_pg import execute_internal_retry + +from antithesis.assertions import sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("first.select_upsert_implementation") + + +def main() -> int: + # Low bit of a SDK-sourced u64 — under Antithesis this routes through the + # SDK so timeline replay picks the same arm; outside Antithesis it falls + # back to a stdlib-seeded RNG (see helper_random). + enable_v2 = (helper_random.random_u64() & 1) == 1 + LOG.info("rolled enable_upsert_v2=%s for this timeline", enable_v2) + + # Set explicitly in both branches so the chosen value is part of the + # timeline's recorded state, not implicit in the bootstrap default. 
+ if enable_v2: + execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = true") + sometimes(True, "upsert continual feedback v2 enabled for timeline", {}) + else: + execute_internal_retry("ALTER SYSTEM SET enable_upsert_v2 = false") + sometimes(True, "upsert continual feedback v1 enabled for timeline", {}) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/helper_none_source.py b/test/antithesis/workload/test/helper_none_source.py index a3cb3c1704be1..87a90b1ac6087 100644 --- a/test/antithesis/workload/test/helper_none_source.py +++ b/test/antithesis/workload/test/helper_none_source.py @@ -22,7 +22,7 @@ import os from helper_kafka import ensure_topic -from helper_pg import execute_retry +from helper_pg import create_source_idempotent from helper_upsert_source import ensure_kafka_connection LOG = logging.getLogger("antithesis.helper_none_source") @@ -46,13 +46,14 @@ def ensure_none_text_source() -> None: # comes later in the driver. Pre-create via admin client so the metadata # fetch succeeds on the first run. ensure_topic(TOPIC_NONE_TEXT) - execute_retry( + create_source_idempotent( f"CREATE SOURCE IF NOT EXISTS {SOURCE_NONE_TEXT} " f"IN CLUSTER {CLUSTER} " f"FROM KAFKA CONNECTION antithesis_kafka_conn (TOPIC '{TOPIC_NONE_TEXT}') " f"FORMAT TEXT " f"INCLUDE PARTITION, OFFSET " - f"ENVELOPE NONE" + f"ENVELOPE NONE", + SOURCE_NONE_TEXT, ) LOG.info( "none-envelope source %s ready (topic=%s)", SOURCE_NONE_TEXT, TOPIC_NONE_TEXT diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py index d90babf162baf..d336905b1914b 100644 --- a/test/antithesis/workload/test/helper_pg.py +++ b/test/antithesis/workload/test/helper_pg.py @@ -31,6 +31,10 @@ PGUSER = os.environ.get("PGUSER", "materialize") PGDATABASE = os.environ.get("PGDATABASE", "materialize") +# Internal pgwire endpoint for system-privileged operations (ALTER SYSTEM SET). 
+PGPORT_INTERNAL = int(os.environ.get("PGPORT_INTERNAL", "6877")) +PGUSER_INTERNAL = os.environ.get("PGUSER_INTERNAL", "mz_system") + # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds # keep drivers progressing without masking real correctness signals. _CONNECT_TIMEOUT_S = 5 @@ -118,3 +122,58 @@ def query_one_retry( ) -> tuple[Any, ...] | None: rows = query_retry(sql, params) return rows[0] if rows else None + + +def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> None: + """Execute a system-privileged statement on the internal port (mz_system). + + Used for ALTER SYSTEM SET and other operations the regular `materialize` + role cannot perform. Retries the same transient errors as `execute_retry`. + """ + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT_INTERNAL, + user=PGUSER_INTERNAL, + dbname=PGDATABASE, + connect_timeout=_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute(sql, params or ()) + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("pg internal execute retrying after %s", exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def create_source_idempotent(create_sql: str, source_name: str) -> None: + """Run a CREATE SOURCE statement, tolerating IF-NOT-EXISTS race gaps. + + `CREATE SOURCE IF NOT EXISTS` only short-circuits on the primary source + name. When two driver invocations race past the existence check, or when + a fault-injected crash mid-DDL leaves an orphan `<source_name>_progress` + subsource in the catalog, the primary create errors with "catalog item + ... already exists" despite `IF NOT EXISTS`. Re-check `mz_sources` after + such an error; if the source landed concurrently, treat as success. + Otherwise re-raise so a true orphan still surfaces.
+ """ + try: + execute_retry(create_sql) + return + except psycopg.errors.InternalError as exc: + if "already exists" not in str(exc): + raise + rows = query_retry( + "SELECT 1 FROM mz_sources WHERE name = %s", + (source_name,), + ) + if rows: + LOG.info("source %s landed concurrently; tolerating collision", source_name) + return + raise diff --git a/test/antithesis/workload/test/helper_upsert_source.py b/test/antithesis/workload/test/helper_upsert_source.py index e5c8ac1cc6e6a..6fac93cdd4f24 100644 --- a/test/antithesis/workload/test/helper_upsert_source.py +++ b/test/antithesis/workload/test/helper_upsert_source.py @@ -22,7 +22,7 @@ import os from helper_kafka import ensure_topic -from helper_pg import execute_retry +from helper_pg import create_source_idempotent, execute_retry LOG = logging.getLogger("antithesis.helper_upsert_source") @@ -48,11 +48,12 @@ def ensure_upsert_text_source() -> None: """ ensure_kafka_connection() ensure_topic(TOPIC_UPSERT_TEXT) - execute_retry( + create_source_idempotent( f"CREATE SOURCE IF NOT EXISTS {SOURCE_UPSERT_TEXT} " f"IN CLUSTER {CLUSTER} " f"FROM KAFKA CONNECTION {CONNECTION_NAME} (TOPIC '{TOPIC_UPSERT_TEXT}') " f"KEY FORMAT TEXT VALUE FORMAT TEXT " - f"ENVELOPE UPSERT" + f"ENVELOPE UPSERT", + SOURCE_UPSERT_TEXT, ) LOG.info("upsert source %s ready (topic=%s)", SOURCE_UPSERT_TEXT, TOPIC_UPSERT_TEXT) From 520f9087d0d742d4a4b1200dc17e3bf593a954c0 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 00:08:38 -0400 Subject: [PATCH 25/65] test/antithesis: add four workload drivers + reclock SUT anchor for catalog properties --- src/storage/src/source/reclock.rs | 26 +- .../scratchbook/property-catalog.md | 8 +- .../test/anytime_fault_recovery_exercised.py | 183 +++++++++++++ ..._kafka_offset_known_not_below_committed.py | 122 +++++++++ ...nytime_kafka_source_resumes_after_fault.py | 245 ++++++++++++++++++ .../workload/test/helper_table_mv.py | 64 +++++ ...rallel_driver_mv_reflects_table_updates.py | 162 ++++++++++++ 7 
files changed, 808 insertions(+), 2 deletions(-) create mode 100755 test/antithesis/workload/test/anytime_fault_recovery_exercised.py create mode 100755 test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py create mode 100755 test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py create mode 100644 test/antithesis/workload/test/helper_table_mv.py create mode 100755 test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py diff --git a/src/storage/src/source/reclock.rs b/src/storage/src/source/reclock.rs index d4ab5ac4b312b..745115e5dbf72 100644 --- a/src/storage/src/source/reclock.rs +++ b/src/storage/src/source/reclock.rs @@ -10,11 +10,13 @@ /// The `ReclockOperator` observes the progress of a stream that is /// timestamped with some source time `FromTime` and generates bindings that describe how the /// collection should evolve in target time `IntoTime`. +use antithesis_sdk::assert_reachable; use differential_dataflow::consolidation; use differential_dataflow::lattice::Lattice; use mz_persist_client::error::UpperMismatch; use mz_repr::Diff; use mz_storage_client::util::remap_handle::RemapHandle; +use serde_json::json; use timely::order::PartialOrder; use timely::progress::Timestamp; use timely::progress::frontier::{Antichain, AntichainRef, MutableAntichain}; @@ -128,6 +130,12 @@ where upper: self.upper.clone(), }; + // Tracks whether append_batch hit an UpperMismatch during this mint + // invocation. If true and we still exit the while loop normally, + // we've exercised the retry path covered by the catalog property + // `reclock-mint-eventually-succeeds`. 
+ let mut cas_retry_count: u64 = 0; + while *self.upper == [IntoTime::minimum()] || (PartialOrder::less_equal(&self.source_upper.frontier(), &new_from_upper) && PartialOrder::less_than(&self.upper, &new_into_upper) @@ -159,12 +167,28 @@ where let new_batch = match self.append_batch(updates, &new_into_upper).await { Ok(trace_batch) => trace_batch, - Err(UpperMismatch { current, .. }) => self.sync(current.borrow()).await, + Err(UpperMismatch { current, .. }) => { + cas_retry_count = cas_retry_count.saturating_add(1); + self.sync(current.borrow()).await + } }; batch.updates.extend(new_batch.updates); batch.upper = new_batch.upper; } + // Reachability anchor for `reclock-mint-eventually-succeeds`: this + // line fires only when a CaS UpperMismatch was observed and the + // mint loop nonetheless terminated. That's the path the catalog + // wants Antithesis to observe at least once per run; reaching it + // is the signal, so the marker is unconditional `assert_reachable!` + // rather than `assert_sometimes!(true, …)`. + if cas_retry_count > 0 { + assert_reachable!( + "reclock: mint completed after at least one compare_and_append UpperMismatch", + &json!({"cas_retry_count": cas_retry_count}) + ); + } + batch } diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 9e94cdf8ed089..8f3e2a2563d74 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,6 +1,6 @@ --- commit: 007c7af9d9970fb2030c7212368b232e0fbc363e -updated: 2026-05-11 +updated: 2026-05-12 --- # Property Catalog: Materialize @@ -189,6 +189,7 @@ Properties that verify the system reaches interesting states under fault injecti |---|---| | **Type** | Liveness | | **Priority** | P0 — most fundamental operational property; prerequisite for all others | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_fault_recovery_exercised.py`. 
Anytime driver probes `SELECT 1` with a short connect timeout (bypassing helper_pg's retry budget so the fault-active window is observable) and records `sometimes("...succeeded after a previously-observed connect failure", …)` for the recovery transition, plus corroborating `sometimes` anchors for "observed replica non-online" and "at least one probe succeeded this invocation". | | **Property** | After the coordinator (environmentd) crashes and restarts, the system eventually becomes healthy (readiness endpoint returns 200) and can serve SQL queries. | | **Invariant** | `Sometimes(healthy_after_crash)`: the system must reach a state where it can serve queries after a crash. This confirms recovery works end-to-end, not just in unit tests. | | **Antithesis Angle** | Kill environmentd at various points during operation. Verify it restarts, reconnects to persist, recovers catalog, and serves queries. Antithesis explores crash timing — during DDL, during peek, during group_commit. | @@ -211,6 +212,7 @@ Properties that verify the system reaches interesting states under fault injecti |---|---| | **Type** | Liveness | | **Priority** | P1 — end-to-end user-visible correctness; Materialize's core value | +| **Status** | **Implemented (workload-side, table-backed)** — `test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py` + `helper_table_mv.py`. Each invocation inserts N rows tagged with a per-invocation prefix into `mv_input_table`, polls the rolling-count MV `mv_input_count` after a quiet period, and pairs `sometimes("mv: row_count caught up …", …)` (liveness anchor) with `always("mv: row_count equals inserted count …", …)` (safety on the settled count). Kafka-source-backed MV is covered indirectly by the Kafka-source drivers — direct MV-on-Kafka-source coverage is deferred. | | **Property** | After data is written to a source, materialized views that depend on that source eventually reflect the new data. 
| | **Invariant** | `Sometimes(mv_contains_new_data)`: after inserting data into a table or producing to a Kafka source, a SELECT on a dependent materialized view must eventually return the new data. | | **Antithesis Angle** | Insert data, inject faults (compute replica crash, storage reconnection), then verify the MV eventually shows the data. Antithesis explores whether faults during the incremental update pipeline cause permanent stalls. | @@ -262,6 +264,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Liveness | | **Priority** | P1 — operational expectation; broker faults are a routine condition | +| **Status** | **Implemented (workload-side, shared driver)** — `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py`. Continuous polling state machine per Kafka source: `OBSERVING` -> `STALLED` after N consecutive identical `offset_committed` samples, then `Reachable("...resumed advancing after a sustained stall", …)` on the first strictly-greater sample. The driver tags each recovery with `saw_kafka_metadata_failure` (broker-fault signal) and `saw_replica_non_online` (clusterd-restart signal) so triage can distinguish the two fault classes. | | **Property** | After a transient network partition or Kafka broker outage that prevents the source from making progress, once connectivity is restored, the source eventually ingests all messages that were produced during the outage. | | **Invariant** | `Sometimes(source_resumes_after_broker_fault)`: at least once per run, after injecting a network fault between materialized and Kafka and then calling `ANTITHESIS_STOP_FAULTS`, the workload observes the source's `COUNT(*)` advance past its pre-fault value. | | **Antithesis Angle** | Network partition between the `materialized` container and the Kafka container; persist+metadata stay reachable. 
Tests rdkafka reconnect, snapshot statistics restoration (commit 0a34b6c79d), and that no permanent stall mode is entered. | @@ -273,6 +276,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Liveness | | **Priority** | P1 — recovery from clusterd kill is the most common operational fault path | +| **Status** | **Implemented (workload-side, shared driver)** — same `test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py` as `kafka-source-survives-broker-fault`. The stall-then-advance transition is fault-kind-agnostic; `saw_replica_non_online` corroborates that the source recovered specifically from a clusterd kill. Combines with the existing `kafka-source-no-data-duplication` and `kafka-source-no-data-loss` assertions to also rule out double-counting and gaps on the rehydrated path. Requires node-termination faults to be enabled in the Antithesis tenant. | | **Property** | After clusterd (storage worker) is killed and restarted, the Kafka source recovers, replays the right resume offsets, and ingests messages produced before, during, and after the restart. | | **Invariant** | `Sometimes(source_recovered_after_clusterd_restart)`: after a kill+restart, eventually `COUNT(*) FROM source >= produced_count`. Combined with `kafka-source-no-data-duplication` to also rule out double-counting. | | **Antithesis Angle** | Direct test of the `storage-command-replay-idempotent` mechanism end-to-end through Kafka. Antithesis explores crash timing across the reclock mint, persist-sink append, and upsert snapshot-completion windows. Requires node-termination faults to be enabled. 
| @@ -390,6 +394,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Liveness | | **Priority** | P2 — pre-existing concern under persist instability | +| **Status** | **Implemented (SUT-side anchor)** — `src/storage/src/source/reclock.rs`: `ReclockOperator::mint` carries a local `cas_retry_count` and fires `assert_reachable!("reclock: mint completed after at least one compare_and_append UpperMismatch", …)` after the while-loop terminates when at least one `UpperMismatch` was observed. The reachability anchor covers the "retry path was exercised AND mint terminated" half of the property. The workload-side "source frontier advanced past the contention point" liveness check is approximated by the existing `anytime_kafka_frontier_monotonic.py` + `anytime_kafka_source_resumes_after_fault.py` drivers and is not duplicated here. | | **Property** | Under transient persist outages or competing writers, the reclock mint loop (`compare_and_append` with `UpperMismatch` retry, reclock.rs:160-166) eventually completes for every source-frontier advance that has data to bind. | | **Invariant** | `Sometimes(mint_completed_after_cas_retry)`: at least once per run, Antithesis observes a reclock mint that took >1 CaS attempt and then completed (i.e. a successful retry path was exercised). Critically, the workload should also observe that the source frontier eventually advances past the value of `source_upper` captured at the time of the contention — i.e. the loop is not livelocked. | | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. 
| @@ -401,6 +406,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P2 — observable statistics correctness; regression target for commit 3e32df1f69 | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py`. Continuous polling driver queries every Kafka source's `mz_source_statistics_per_worker` row and fires `always("kafka: source offset_known < offset_committed", …)` whenever a single per-worker row has `offset_known < offset_committed`. Both fields are read from the same row of the same query so the comparison cannot cross a metric-update boundary. The SUT-side mirror in `src/storage/src/statistics.rs` is deferred. | | **Property** | For every Kafka source, the source-statistics view always reports `offset_known >= offset_committed`. The metric `offset_known` reflects what the broker has told us is available; `offset_committed` reflects what Materialize has durably ingested. Causally, `offset_known` cannot lag `offset_committed`. | | **Invariant** | `Always`: a polling assertion in the workload — `SELECT offset_known, offset_committed FROM mz_internal.mz_source_statistics_per_worker WHERE id = ?` — invariant `offset_known >= offset_committed`. Mirror as an `assert_always!` inside the statistics update path in `src/storage/src/statistics.rs`. | | **Antithesis Angle** | Clusterd restart resets `offset_known` to broker-reported watermark while `offset_committed` is restored from persist. If the restoration order is wrong, the invariant flips. Direct regression target for commit 3e32df1f69. 
| diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py new file mode 100755 index 0000000000000..143dd8c103dce --- /dev/null +++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `fault-recovery-exercised`. + +The most fundamental liveness property in the catalog: after the system +takes a hit from Antithesis fault injection, it must eventually come back +and serve SQL again. The catalog frames this in terms of the `/health/ready` +endpoint returning 200; this workload uses `SELECT 1` (the cheapest +end-to-end pgwire round trip) as the proxy, and observes the cluster +replica status as a corroborating signal. + +Approach: + - Probe `materialized` with a *short-budget* psycopg connect on every + tick. Long retry budgets in `helper_pg` would mask the actual + fault-active periods we want to detect — here we want to observe the + transitions. + - Track per-tick state: was this probe a success or a connect/query + failure? + - If we observe a failure at tick T and a success at tick T+k (any k>=1) + within this invocation, that is the recovery transition we care about, + and we fire `sometimes("...query succeeded after observed fault")`. + + - Separately, fire `sometimes("...observed cluster replica non-online")` + when `mz_cluster_replica_statuses` reports any antithesis replica + `offline`. 
This is a corroborating signal so triage can distinguish
+    "no fault ever landed" from "faults landed but no recovery observed."
+
+This is an `anytime_` driver — Antithesis launches it many times, each
+short-lived. Recovery transitions accumulate across invocations.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import time
+
+import psycopg
+from helper_pg import (
+    PGDATABASE,
+    PGHOST,
+    PGPORT,
+    PGUSER,
+    query_one_retry,
+)
+
+from antithesis.assertions import sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.fault_recovery_exercised")
+
+POLL_INTERVAL_S = 0.5
+RUN_BUDGET_S = 30.0
+PROBE_CONNECT_TIMEOUT_S = 2.0
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+
+
+def _probe_select_one() -> bool:
+    """Run `SELECT 1` with a short connect timeout. Return True on success.
+
+    Distinct from the resilient `helper_pg.query_*` paths because we *want*
+    to observe transient failures here — they are the fault-active half of
+    the recovery transition we are looking for.
+
+    Any exception (connect or query) is reported as a failed probe.
+    """
+    try:
+        with psycopg.connect(
+            host=PGHOST,
+            port=PGPORT,
+            user=PGUSER,
+            dbname=PGDATABASE,
+            # The float constant is truncated to whole seconds here.
+            connect_timeout=int(PROBE_CONNECT_TIMEOUT_S),
+            autocommit=True,
+        ) as conn, conn.cursor() as cur:
+            cur.execute("SELECT 1")
+            row = cur.fetchone()
+            return row is not None and row[0] == 1
+    except Exception:  # noqa: BLE001
+        return False
+
+
+def _replica_non_online() -> bool:
+    """Best-effort: is any antithesis-cluster replica reporting non-online?
+
+    Uses the retry-budgeted query helper because we want a clear yes/no, not
+    a probe outcome — if the helper can't get an answer we conservatively
+    return False so the corroborating signal stays silent rather than
+    accidentally firing on a probe-side failure.
+    """
+    try:
+        row = query_one_retry(
+            """
+            SELECT EXISTS (
+                SELECT 1
+                FROM mz_internal.mz_cluster_replica_statuses s
+                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                JOIN mz_clusters c ON c.id = r.cluster_id
+                WHERE c.name = %s AND s.status != 'online'
+            )
+            """,
+            (ANTITHESIS_CLUSTER,),
+        )
+    except Exception:  # noqa: BLE001
+        return False
+    return bool(row and row[0])
+
+
+def main() -> int:
+    """Probe until RUN_BUDGET_S elapses, then report the `sometimes` anchors.
+
+    Always returns 0; the outcome is carried by the Antithesis assertions,
+    not by the exit status.
+    """
+    deadline = time.monotonic() + RUN_BUDGET_S
+
+    # Per-invocation state. The driver is short-lived; Antithesis covers the
+    # full timeline by launching many invocations.
+    saw_failure = False
+    saw_recovery_after_failure = False
+    saw_replica_non_online = False
+    successes = 0
+    failures = 0
+
+    while time.monotonic() < deadline:
+        ok = _probe_select_one()
+        if ok:
+            successes += 1
+            # A success only counts as a recovery if a failure was seen
+            # earlier in this same invocation.
+            if saw_failure:
+                saw_recovery_after_failure = True
+        else:
+            failures += 1
+            saw_failure = True
+
+        if _replica_non_online():
+            saw_replica_non_online = True
+
+        time.sleep(POLL_INTERVAL_S)
+
+    sometimes(
+        saw_recovery_after_failure,
+        "fault recovery: SELECT 1 succeeded after a previously-observed connect failure",
+        {
+            "successes": successes,
+            "failures": failures,
+            "saw_replica_non_online": saw_replica_non_online,
+        },
+    )
+    sometimes(
+        saw_replica_non_online,
+        "fault recovery: observed antithesis_cluster replica non-online at least once",
+        {"successes": successes, "failures": failures},
+    )
+    # Bare-minimum healthy-coverage signal: at least one successful probe in
+    # the invocation. If this ever goes 0/N across a run, no driver was
+    # ever able to talk to Materialize and the entire test is suspect —
+    # downstream property assertions would be vacuous.
+    sometimes(
+        successes > 0,
+        "fault recovery: at least one SELECT 1 succeeded this invocation",
+        {"successes": successes, "failures": failures},
+    )
+
+    LOG.info(
+        "fault-recovery probe done; successes=%d failures=%d recovery=%s replica_offline=%s",
+        successes,
+        failures,
+        saw_recovery_after_failure,
+        saw_replica_non_online,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    # The PG* names are already used by `_probe_select_one`; this tuple only
+    # gives `os`, which has no other use in this module, a reference.
+    _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os)
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
new file mode 100755
index 0000000000000..9801c4dfa65b7
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `offset-known-not-below-committed`.
+
+For every Kafka source, `mz_internal.mz_source_statistics_per_worker` must
+always report `offset_known >= offset_committed`. `offset_known` reflects
+what the broker has told us is available; `offset_committed` reflects what
+Materialize has durably ingested. Causally, the broker's idea of "this
+offset exists" cannot lag what we've already durably read past it. Direct
+regression target for commit 3e32df1f69, which clamped the metric to
+prevent this flip on the first sample after a clusterd restart.
+
+This is an `anytime_` driver — it runs continuously throughout the timeline
+under active fault injection. The interesting timing per the catalog is the
+very first sample after a clusterd restart, where `offset_known` is
+restored from the broker watermark while `offset_committed` is restored
+from persist; we want Antithesis to drop a poll into that window.
+
+Both fields are read in the same row of the same SELECT so the comparison
+never crosses a metric-update boundary. The per-worker view is queried
+(not the rolled-up `mz_source_statistics`) because the invariant must hold
+per worker — averaging would mask a single worker that crossed the line.
+
+Errors during polling (clusterd down, network partitioned) are *expected*
+under fault injection and must not produce false-positive failures; we
+just skip the sample.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+from helper_pg import query_retry
+
+from antithesis.assertions import always
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.kafka_offset_known_not_below_committed")
+
+POLL_INTERVAL_S = 0.5
+RUN_BUDGET_S = 30.0
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+
+
+def _samples() -> list[tuple[str, int, int, int]]:
+    """Return (source_name, worker_id, offset_known, offset_committed) per worker.
+
+    Joins `mz_source_statistics_per_worker` to `mz_sources` so the assertion
+    `details` can name the source by name rather than by opaque id. Filters
+    to Kafka sources owned by the antithesis cluster so the assertion does
+    not fire against the introspection cluster's bookkeeping sources.
+
+    Rows with NULL `offset_known` or `offset_committed` are dropped — those
+    are early-lifetime samples that have not been populated yet.
+
+    Exceptions from `query_retry` propagate; `main` treats them as a
+    skipped sample.
+    """
+    rows = query_retry(
+        """
+        SELECT
+            s.name,
+            ss.worker_id::bigint,
+            ss.offset_known::bigint,
+            ss.offset_committed::bigint
+        FROM mz_internal.mz_source_statistics_per_worker ss
+        JOIN mz_sources s ON s.id = ss.id
+        JOIN mz_clusters c ON c.id = s.cluster_id
+        WHERE c.name = %s
+          AND s.type = 'kafka'
+          AND ss.offset_known IS NOT NULL
+          AND ss.offset_committed IS NOT NULL
+        """,
+        (ANTITHESIS_CLUSTER,),
+    )
+    # Tuple order follows the SELECT list: name, worker, known, committed.
+    return [(str(n), int(w), int(k), int(o)) for (n, w, k, o) in rows]
+
+
+def main() -> int:
+    """Poll per-worker source statistics until RUN_BUDGET_S elapses.
+
+    Always returns 0; a violation is reported through the `always`
+    assertion, not through the exit status.
+    """
+    deadline = time.monotonic() + RUN_BUDGET_S
+    polled = 0
+
+    while time.monotonic() < deadline:
+        try:
+            samples = _samples()
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("source stats query failed: %s; sleeping and retrying", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        for source, worker, known, committed in samples:
+            # NOTE: the message text names the *violating* condition; the
+            # asserted invariant is the first argument, `known >= committed`.
+            always(
+                known >= committed,
+                "kafka: source offset_known < offset_committed",
+                {
+                    "source": source,
+                    "worker_id": worker,
+                    "offset_known": known,
+                    "offset_committed": committed,
+                    "deficit": committed - known,
+                },
+            )
+            polled += 1
+
+        time.sleep(POLL_INTERVAL_S)
+
+    LOG.info("offset_known-not-below-committed check done; %d samples", polled)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
new file mode 100755
index 0000000000000..85042a317d7cb
--- /dev/null
+++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `kafka-source-survives-broker-fault` and +`kafka-source-survives-clusterd-restart` (combined liveness signal). + +Both catalog properties amount to: after a transient fault that prevents +the source from making progress, once the fault is over the source must +ingest the messages it was unable to read during the outage. Externally +this looks identical for either fault kind — `offset_committed` stalls +during the outage and resumes advancing afterward — so one anytime driver +records the stall-then-advance transition and we tag the corroborating +fault signal (kafka broker reachable / replica online) in `details` so +triage can distinguish the two cases on a hit. + +Per-invocation state machine, per source: + - `IDLE` (initial). On a successful sample, store the offset and move + to `OBSERVING`. + - `OBSERVING`. If the sample equals the stored value for STALL_TICKS + consecutive ticks, move to `STALLED` (the source has stopped + progressing — most likely fault-induced). Otherwise, refresh the + stored value. + - `STALLED`. On any sample strictly greater than the stalled value, fire + the `reachable(...)` recovery anchor and return to `OBSERVING` with + the new value. Otherwise stay stalled. + +Failed samples (clusterd unavailable, network partition) do not transition +the state machine — they are the fault-active condition we want to bridge +over. They are logged and skipped; the `details` payload corroborates the +recovery transition with the separately-collected fault signals.
+ +The driver also records two corroborating `sometimes(...)` signals so +triage can confirm Antithesis actually hit each of the two fault classes +this property cluster cares about: + - replica went non-online (clusterd-restart signal) + - direct Kafka admin metadata fetch failed (broker-fault signal) +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +from helper_pg import query_one_retry, query_retry + +from antithesis.assertions import reachable, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.kafka_source_resumes_after_fault") + +POLL_INTERVAL_S = 1.0 +RUN_BUDGET_S = 45.0 +# Number of consecutive identical samples after which we consider the source +# "stalled" rather than just briefly idle. Five seconds (5 ticks * 1s) +# comfortably exceeds the natural quiet-period between produces but is well +# below the fault-injection windows Antithesis schedules. 
+STALL_TICKS = 5
+
+ANTITHESIS_CLUSTER = "antithesis_cluster"
+KAFKA_BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092")
+
+
+def _kafka_sources() -> list[str]:
+    """Return the names of all Kafka sources in the antithesis cluster."""
+    rows = query_retry(
+        """
+        SELECT s.name
+        FROM mz_sources s
+        JOIN mz_clusters c ON c.id = s.cluster_id
+        WHERE c.name = %s AND s.type = 'kafka'
+        """,
+        (ANTITHESIS_CLUSTER,),
+    )
+    return [r[0] for r in rows]
+
+
+def _offset_committed(source_name: str) -> int | None:
+    """Aggregated offset_committed across workers for `source_name`.
+
+    Returns None when the statistics view has no populated value yet.
+    """
+    row = query_one_retry(
+        """
+        SELECT MAX(ss.offset_committed)::bigint
+        FROM mz_internal.mz_source_statistics ss
+        JOIN mz_sources s ON s.id = ss.id
+        WHERE s.name = %s
+        """,
+        (source_name,),
+    )
+    if row is None or row[0] is None:
+        return None
+    return int(row[0])
+
+
+def _replica_non_online() -> bool:
+    """Best-effort: is any antithesis-cluster replica reporting non-online?
+
+    A failed status query is reported as False so the corroborating signal
+    stays silent rather than firing on a query-side failure.
+    """
+    try:
+        row = query_one_retry(
+            """
+            SELECT EXISTS (
+                SELECT 1
+                FROM mz_internal.mz_cluster_replica_statuses s
+                JOIN mz_cluster_replicas r ON r.id = s.replica_id
+                JOIN mz_clusters c ON c.id = r.cluster_id
+                WHERE c.name = %s AND s.status != 'online'
+            )
+            """,
+            (ANTITHESIS_CLUSTER,),
+        )
+    except Exception:  # noqa: BLE001
+        return False
+    return bool(row and row[0])
+
+
+def _kafka_metadata_failed() -> bool:
+    """Best-effort: did a direct Kafka metadata fetch fail?
+
+    A successful Materialize-side ingestion still goes through the broker,
+    so a metadata fetch failure here is a strong signal that the
+    `materialized <-> kafka` channel was partitioned even though the
+    `materialized <-> postgres-metadata` channel still works (the
+    `kafka-source-survives-broker-fault` shape).
+
+    Defensive imports because the kafka admin client only runs cleanly with
+    a reachable broker. We avoid raising into the polling loop.
+    """
+    try:
+        from confluent_kafka.admin import AdminClient
+    except Exception:  # noqa: BLE001
+        # No client library available: report "no failure observed" rather
+        # than misattributing an import problem to the broker.
+        return False
+    try:
+        AdminClient({"bootstrap.servers": KAFKA_BROKER}).list_topics(timeout=2)
+        return False
+    except Exception:  # noqa: BLE001
+        return True
+
+
+def main() -> int:
+    """Run the per-source stall/resume state machine until RUN_BUDGET_S elapses.
+
+    Always returns 0; outcomes are reported through the Antithesis
+    `reachable` / `sometimes` assertions, not through the exit status.
+    """
+    deadline = time.monotonic() + RUN_BUDGET_S
+
+    # Per-source state machine.
+    #   state: "OBSERVING" or "STALLED"
+    #   last_value: most recent committed offset observed
+    #   stall_streak: consecutive ticks at last_value
+    # A source with no entry yet is in the initial (`IDLE`) state.
+    states: dict[str, dict] = {}
+
+    # Cross-source corroborating signals collected throughout this run.
+    saw_replica_non_online = False
+    saw_kafka_metadata_failure = False
+    # Per-source: did we observe stall->advance at least once.
+    resumed_after_stall: dict[str, bool] = {}
+
+    while time.monotonic() < deadline:
+        if _replica_non_online():
+            saw_replica_non_online = True
+        if _kafka_metadata_failed():
+            saw_kafka_metadata_failure = True
+
+        try:
+            sources = _kafka_sources()
+        except Exception as exc:  # noqa: BLE001
+            LOG.info("source list query failed: %s; sleeping", exc)
+            time.sleep(POLL_INTERVAL_S)
+            continue
+
+        for source in sources:
+            try:
+                observed = _offset_committed(source)
+            except Exception as exc:  # noqa: BLE001
+                LOG.info("offset_committed query failed for %s: %s", source, exc)
+                continue
+            if observed is None:
+                continue
+
+            st = states.get(source)
+            if st is None:
+                # First successful sample: only record the baseline. Comparing
+                # the sample against itself would count it as one stalled
+                # tick and declare a stall one interval early.
+                states[source] = {
+                    "state": "OBSERVING",
+                    "last_value": observed,
+                    "stall_streak": 0,
+                }
+                continue
+
+            if st["state"] == "OBSERVING":
+                if observed == st["last_value"]:
+                    st["stall_streak"] += 1
+                    if st["stall_streak"] >= STALL_TICKS:
+                        st["state"] = "STALLED"
+                else:
+                    # Progress: reset.
+                    st["last_value"] = observed
+                    st["stall_streak"] = 0
+            else:  # STALLED
+                if observed > st["last_value"]:
+                    # Recovery transition: fire the per-source signal once
+                    # per invocation (we still update state so we can detect
+                    # additional stalls and resumes).
+                    if not resumed_after_stall.get(source, False):
+                        resumed_after_stall[source] = True
+                        # Reaching here is the property: a source was stalled,
+                        # then advanced. Use `reachable(...)` rather than
+                        # `sometimes(True, ...)` per the SDK assertion-type
+                        # guidance.
+                        reachable(
+                            "kafka source: offset_committed resumed advancing after a sustained stall",
+                            {
+                                "source": source,
+                                "stalled_at": st["last_value"],
+                                "observed_after_recovery": observed,
+                                "stall_ticks_required": STALL_TICKS,
+                                "saw_replica_non_online": saw_replica_non_online,
+                                "saw_kafka_metadata_failure": saw_kafka_metadata_failure,
+                            },
+                        )
+                    st["state"] = "OBSERVING"
+                    st["last_value"] = observed
+                    st["stall_streak"] = 0
+
+        time.sleep(POLL_INTERVAL_S)
+
+    sometimes(
+        saw_replica_non_online,
+        "kafka source resumes: observed antithesis_cluster replica non-online",
+        {"resumed_sources": sorted(resumed_after_stall.keys())},
+    )
+    sometimes(
+        saw_kafka_metadata_failure,
+        "kafka source resumes: observed direct Kafka metadata fetch failure",
+        {"resumed_sources": sorted(resumed_after_stall.keys())},
+    )
+
+    LOG.info(
+        "kafka-source-resumes-after-fault done; sources_resumed=%d replica_offline=%s metadata_failed=%s",
+        sum(1 for v in resumed_after_stall.values() if v),
+        saw_replica_non_online,
+        saw_kafka_metadata_failure,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/test/antithesis/workload/test/helper_table_mv.py b/test/antithesis/workload/test/helper_table_mv.py
new file mode 100644
index 0000000000000..e865f3f2f5e89
--- /dev/null
+++ b/test/antithesis/workload/test/helper_table_mv.py
@@ -0,0 +1,64 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Idempotent setup for the Antithesis table + materialized view scaffolding.
+
+Used by the `mv-reflects-source-updates` driver. The table holds rows with a
+per-invocation `prefix` so concurrent driver instances scope to disjoint
+groups, and the materialized view rolls those rows up by prefix:
+
+    CREATE TABLE mv_input_table (id BIGINT NOT NULL, prefix TEXT NOT NULL);
+    CREATE MATERIALIZED VIEW mv_input_count AS
+        SELECT prefix, COUNT(*)::BIGINT AS row_count
+        FROM mv_input_table
+        GROUP BY prefix;
+
+Defining the MV on the local coordinator's table (rather than a Kafka
+source) deliberately tests the end-to-end path independent of source
+ingestion: dataflow rendering, persist write of the MV output, and
+frontier advancement through compute. Source-side faults are still
+exercised because the workload runs under the same fault-injection regime
+as everything else.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from helper_pg import execute_retry
+
+LOG = logging.getLogger("antithesis.helper_table_mv")
+
+# Cluster the MV is created in; overridable through the environment.
+CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
+
+TABLE_MV_INPUT = "mv_input_table"
+MV_NAME = "mv_input_count"
+
+
+def ensure_table_and_mv() -> None:
+    """Create the input table and the materialized view if absent.
+
+    Both DDLs use IF NOT EXISTS so concurrent driver instances racing
+    through setup do not collide. The MV is created in the antithesis
+    cluster so dataflow execution is colocated with the rest of the
+    workload's compute.
+
+    Object and cluster names are interpolated into the DDL text with
+    f-strings (identifiers cannot be bound as query parameters), so
+    `CLUSTER` must be a plain, trusted identifier.
+    """
+    execute_retry(
+        f"CREATE TABLE IF NOT EXISTS {TABLE_MV_INPUT} "
+        f"(id BIGINT NOT NULL, prefix TEXT NOT NULL)"
+    )
+    # The table is created first because the MV definition selects from it.
+    execute_retry(
+        f"CREATE MATERIALIZED VIEW IF NOT EXISTS {MV_NAME} "
+        f"IN CLUSTER {CLUSTER} AS "
+        f"SELECT prefix, COUNT(*)::BIGINT AS row_count "
+        f"FROM {TABLE_MV_INPUT} "
+        f"GROUP BY prefix"
+    )
+    LOG.info("table %s and MV %s ready", TABLE_MV_INPUT, MV_NAME)
diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
new file mode 100755
index 0000000000000..c026be09ea522
--- /dev/null
+++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+"""Antithesis driver for `mv-reflects-source-updates`.
+
+End-to-end user-visible property: after data is written to an upstream
+collection, materialized views that depend on that collection eventually
+reflect the new data. Materialize's headline value proposition.
+
+This driver uses a TABLE (not a Kafka source) so the property is exercised
+independent of source ingestion: the test path is INSERT -> coordinator
+group_commit -> persist write of the table -> MV's compute dataflow ->
+persist write of the MV output -> SELECT. Kafka-source-specific liveness
+is covered by the other Kafka-source drivers.
+
+Each invocation:
+  1. Ensures `mv_input_table` + materialized view `mv_input_count` exist.
+  2. Picks a per-invocation prefix so concurrent driver instances scope to
+     disjoint MV rows.
+  3. INSERTs N rows tagged with the prefix.
+  4.
Requests an Antithesis quiet period and polls the MV until the count
+     for the prefix equals N.
+  5. Asserts:
+     - `always(...)` the MV count matches what was inserted (no over- or
+       under-counting after settle).
+     - `sometimes(...)` the catchup completed within the budget (the
+       liveness anchor — without this, the always check could be vacuous
+       on a slow-catchup invocation).
+
+This is a `parallel_driver_` — many concurrent instances exercise the MV
+without colliding because each invocation owns its prefix range.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+
+import helper_random
+from helper_pg import execute_retry, query_one_retry
+from helper_quiet import request_quiet_period
+from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv
+
+from antithesis.assertions import always, sometimes
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
+)
+LOG = logging.getLogger("driver.mv_reflects_table_updates")
+
+INSERTS_PER_INVOCATION = 40
+QUIET_PERIOD_S = 20
+CATCHUP_TIMEOUT_S = 60.0
+CATCHUP_POLL_INTERVAL_S = 0.5
+
+
+def _mv_count_for_prefix(prefix: str) -> int | None:
+    """Return the row_count the MV currently reports for `prefix`, or None.
+
+    None means "no row exists for that prefix yet" — distinct from zero,
+    which the MV would not produce for the `count(*)`+`group by` shape (a
+    fully-deleted prefix would not appear at all).
+    """
+    row = query_one_retry(
+        f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s",
+        (prefix,),
+    )
+    if row is None:
+        return None
+    return int(row[0])
+
+
+def main() -> int:
+    """Insert one prefixed batch, wait for the MV, and report both anchors.
+
+    Always returns 0; outcomes are reported through the Antithesis
+    `sometimes` / `always` assertions, not through the exit status.
+    """
+    ensure_table_and_mv()
+
+    prefix = f"p{helper_random.random_u64():016x}"
+    LOG.info("mv driver starting; prefix=%s", prefix)
+
+    # Insert N rows tagged with the prefix. We batch into a single statement
+    # so the coordinator processes them as one group_commit, which keeps the
+    # workload-visible target offset for catchup well-defined (otherwise a
+    # mid-insert crash would split the row count and the MV would catch up
+    # to "some" count rather than exactly N).
+    placeholders = ", ".join(["(%s, %s)"] * INSERTS_PER_INVOCATION)
+    params: list[object] = []
+    for i in range(INSERTS_PER_INVOCATION):
+        params.extend([i, prefix])
+    execute_retry(
+        f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES {placeholders}",
+        params,
+    )
+
+    request_quiet_period(QUIET_PERIOD_S)
+
+    # Poll the MV until the row_count for this prefix reaches N. The MV's
+    # `COUNT(*) GROUP BY prefix` shape means the row for this prefix may
+    # appear partially populated during the catchup window.
+    #
+    # The loop also stops as soon as the count *exceeds* N. Polling for
+    # strict equality would turn an over-count into a silent timeout and
+    # the safety assertion below could never fail.
+    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
+    observed = _mv_count_for_prefix(prefix)
+    while (
+        observed is None or observed < INSERTS_PER_INVOCATION
+    ) and time.monotonic() < deadline:
+        time.sleep(CATCHUP_POLL_INTERVAL_S)
+        observed = _mv_count_for_prefix(prefix)
+
+    caught_up = observed == INSERTS_PER_INVOCATION
+    reached_target = observed is not None and observed >= INSERTS_PER_INVOCATION
+
+    sometimes(
+        caught_up,
+        "mv: row_count caught up to inserted count after quiet period",
+        {
+            "mv": MV_NAME,
+            "table": TABLE_MV_INPUT,
+            "prefix": prefix,
+            "expected": INSERTS_PER_INVOCATION,
+            "observed": observed,
+        },
+    )
+
+    if not reached_target:
+        LOG.info(
+            "catchup did not complete in budget; skipping safety assertion "
+            "(observed=%s expected=%d)",
+            observed,
+            INSERTS_PER_INVOCATION,
+        )
+        return 0
+
+    # Safety check: once the MV has reached the target it must report
+    # exactly the inserted count. A higher count is double-counting
+    # (corruption), and this is the branch that surfaces it.
+    # NOTE(review): assumes `execute_retry` applies the INSERT exactly once;
+    # if it can re-send after an indeterminate commit, a duplicate batch
+    # would also trip this assertion — confirm against helper_pg.
+    always(
+        observed == INSERTS_PER_INVOCATION,
+        "mv: row_count equals inserted count for prefix after settle",
+        {
+            "mv": MV_NAME,
+            "table": TABLE_MV_INPUT,
+            "prefix": prefix,
+            "expected": INSERTS_PER_INVOCATION,
+            "observed": observed,
+        },
+    )
+
+    LOG.info(
+        "mv driver done; inserted=%d mv_count=%s prefix=%s",
+        INSERTS_PER_INVOCATION,
+        observed,
+        prefix,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
From 7c026caeb4de1cf4c70602e7f4f5f0a8d3029db0 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 00:25:31 -0400 Subject: [PATCH 26/65] test/antithesis: persist-cas-monotonicity SUT anchor + strict-serializable-reads workload driver --- Cargo.lock | 1 + src/persist-client/Cargo.toml | 1 + src/persist-client/src/internal/apply.rs | 18 ++ .../scratchbook/property-catalog.md | 2 + ...rallel_driver_strict_serializable_reads.py | 225 ++++++++++++++++++ 5 files changed, 247 insertions(+) create mode 100755 test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py diff --git a/Cargo.lock b/Cargo.lock index 2f4eed40b37c2..78cfc5d7ecd59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7203,6 +7203,7 @@ dependencies = [ name = "mz-persist-client" version = "26.25.0-dev.0" dependencies = [ + "antithesis_sdk", "anyhow", "arrayvec 0.7.6", "arrow", diff --git a/src/persist-client/Cargo.toml b/src/persist-client/Cargo.toml index 0fad73a172d71..0d2b068964372 100644 --- a/src/persist-client/Cargo.toml +++ b/src/persist-client/Cargo.toml @@ -28,6 +28,7 @@ name = "benches" harness = false [dependencies] +antithesis_sdk.workspace = true anyhow.workspace = true arrayvec.workspace = true arrow.workspace = true diff --git a/src/persist-client/src/internal/apply.rs b/src/persist-client/src/internal/apply.rs index a48982ff77eb9..5085b24b3d6fb 100644 --- a/src/persist-client/src/internal/apply.rs +++ b/src/persist-client/src/internal/apply.rs @@ -15,6 +15,9 @@ use std::ops::ControlFlow::{self, Break, Continue}; use std::sync::Arc; use
std::time::Instant; +use antithesis_sdk::assert_always_greater_than; +use serde_json::json; + use crate::cache::{LockingTypedState, StateCache}; use crate::error::{CodecMismatch, InvalidUsage}; use crate::internal::gc::GcReq; @@ -598,6 +601,21 @@ where } } + // Antithesis-reportable form of the broader `persist-cas-monotonicity` + // catalog property: SeqNo must strictly increase across any committed + // state transition. The narrower equality check below (next == seqno) + // still panics on violation and stays in place to catch skip/regress + // in the same call. + assert_always_greater_than!( + new_state.seqno().0, + expected.0, + "persist: state seqno did not strictly increase across CaS apply", + &json!({ + "expected_prev": expected.0, + "computed_next": new_state.seqno().0, + "cmd": cmd.name, + }) + ); assert_eq!( expected.next(), new_state.seqno(), diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 8f3e2a2563d74..746796228d668 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -26,6 +26,7 @@ Properties that verify data correctness when crashes, network partitions, and co |---|---| | **Type** | Safety | | **Priority** | P0 — backbone of persist consistency; all other persist properties depend on this | +| **Status** | **Implemented (SUT-side)** — `src/persist-client/src/internal/apply.rs`: alongside the existing `assert_eq!(expected.next(), new_state.seqno(), …)` strict-increment check in `compute_next_state_locked`, an `assert_always_greater_than!(new_state.seqno().0, expected.0, "persist: state seqno did not strictly increase across CaS apply", …)` makes the broader monotonicity invariant a reportable Antithesis property rather than only a process panic. The strict-equality `assert_eq!` is retained so the narrower invariant (next == seqno) still surfaces. 
The companion rollup-seqno invariant (`state.rs:1324` doc comment) is deferred. | | **Property** | Persist shard state versions (SeqNo) form a strictly increasing sequence. No writer can observe or apply a lower SeqNo after observing a higher one. | | **Invariant** | `Always`: for any shard, if SeqNo N is observed, no subsequent observation returns SeqNo < N. Rollups maintain seqno <= seqno_since. This must hold on every check — a single violation means state corruption. | | **Antithesis Angle** | Partition storage from persist backend mid-write. One writer races to increment SeqNo while another caches an old value and retries. Crash during GC/rollup operations. Antithesis explores interleaving of concurrent CaS loops. | @@ -74,6 +75,7 @@ Properties that verify Materialize's strict serializability guarantee and timest |---|---| | **Type** | Safety | | **Priority** | P0 — Materialize's core advertised guarantee; user-visible | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py`. Inserts one row per step into `mv_input_table` and, between steps, opens a *fresh* psycopg connection (explicit `SET transaction_isolation TO 'strict serializable'`) to SELECT the rolling-count MV's row for the invocation's prefix. After a quiet-period closing observation, asserts (a) `always("…fresh-connection read regressed across adjacent observations", …)` for every adjacent pair, and (b) `always("…closing fresh-connection read regressed below earlier maximum", …)` for the closing read versus the historical max. One `sometimes("…final fresh-connection read reached inserted count", …)` liveness anchor. The SUT-side oracle-timestamp-non-decreasing mirror in `src/adapter/src/coord/in_memory_oracle.rs` is deferred. | | **Property** | Two reads on the same collection at timestamps t1 < t2 (assigned by the oracle) must observe consistent ordering: if t1 sees state S, t2 cannot observe a state prior to S. 
| | **Invariant** | `Always`: for any two reads where oracle assigns t1 < t2, the result at t2 must include all changes visible at t1. The oracle read timestamp must advance monotonically. | | **Antithesis Angle** | Run parallel transactions in StrictSerializable mode. One writes, another reads concurrently. Inject delays in oracle timestamp advancement. Antithesis explores whether reads can bypass the linearization point. | diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py new file mode 100755 index 0000000000000..c4af73b434635 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `strict-serializable-reads`. + +Materialize's headline consistency guarantee: two reads on the same +collection at oracle-assigned timestamps t1 < t2 must observe consistent +ordering — anything visible at t1 must remain visible at t2. This driver +exercises the cross-read half of that property: a sequence of fresh- +connection reads against a materialized view, interleaved with writes, +must yield a non-decreasing count. + +Approach: + 1. Reuse `helper_table_mv` (table `mv_input_table` + MV `mv_input_count`) + so this driver does not introduce new schema. Each invocation owns a + fresh prefix so concurrent driver instances scope to disjoint rows. + 2. For each step k = 1..N: + - INSERT one row tagged with the prefix in autocommit mode (each + insert is its own oracle-timestamped write). 
+ - Open a *fresh* psycopg connection, set `transaction_isolation` + to `strict serializable` explicitly, and SELECT the MV's row + count for the prefix. Record (k, observed_count). + - Fresh connections are deliberate: a single long-lived connection + could mask a read-regression bug behind connection-local caching. + 3. After all steps, run one more fresh-connection SELECT as the final + observation. + 4. Assertions: + - `always(count[k+1] >= count[k], …)` between every adjacent pair + of recorded reads — the core strict-serializable read ordering + invariant. + - `always(final >= max(count), …)` for the closing observation. + - `sometimes(...)` liveness anchor confirming the closing + observation reached the inserted count after the quiet period. + +Read failures (connect timeout, server unavailable mid-fault) are skipped +rather than recorded — they are not regression evidence, and a false +positive on transient unavailability would obscure real bugs. + +This is a `parallel_driver_` — many concurrent instances run because the +property is about read monotonicity *within* each client's observation +stream, and prefix-scoping isolates each instance's expected count. 
+""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +import helper_random +import psycopg +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGUSER, + execute_retry, +) +from helper_quiet import request_quiet_period +from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.strict_serializable_reads") + +STEPS_PER_INVOCATION = 12 +QUIET_PERIOD_S = 15 +FINAL_READ_TIMEOUT_S = 30.0 +FINAL_READ_POLL_S = 0.5 +PROBE_CONNECT_TIMEOUT_S = 5 + + +def _fresh_select_count(prefix: str) -> int | None: + """Open a *new* connection, force strict serializable, and SELECT the + MV's row_count for `prefix`. Returns None on any connect/query failure + so the caller can skip the observation without conflating fault-induced + unavailability with a read regression. + + Setting `transaction_isolation` explicitly costs one extra round trip + but defends against future changes to the system default. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=PROBE_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute("SET transaction_isolation TO 'strict serializable'") + cur.execute( + f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s", + (prefix,), + ) + row = cur.fetchone() + except Exception: # noqa: BLE001 + return None + if row is None: + return 0 # MV has no row for this prefix yet + return int(row[0]) + + +def main() -> int: + ensure_table_and_mv() + + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("strict-serializable driver starting; prefix=%s", prefix) + + # Sequence of (step_index, observed_count). Reads that failed are + # represented as None and dropped before assertions. 
+ observations: list[tuple[int, int]] = [] + + for step in range(1, STEPS_PER_INVOCATION + 1): + # Each INSERT is one autocommit write; the coordinator stamps it + # with an oracle timestamp. We INSERT before the read so the + # *expected* monotone behaviour is that every read is >= the + # previous one and the final read equals the total insert count + # (modulo catchup; covered by the liveness anchor below). + try: + execute_retry( + f"INSERT INTO {TABLE_MV_INPUT} (id, prefix) VALUES (%s, %s)", + (step, prefix), + ) + except Exception as exc: # noqa: BLE001 + # Persistent insert failure under sustained fault — bail. + # Already-recorded observations are still valid evidence for + # the monotonicity assertion below. + LOG.info("step %d: insert failed (%s); ending step loop", step, exc) + break + + observed = _fresh_select_count(prefix) + if observed is None: + # Fault-window read; skip. We do NOT record it so the + # adjacent-pair assertion below doesn't see a spurious zero. + continue + observations.append((step, observed)) + + # Settle and take the closing observation. The driver is short and the + # observations list is small, so a generous timeout here is fine. + request_quiet_period(QUIET_PERIOD_S) + expected_final = len(observations) and observations[-1][0] + # `expected_final` is the step of the last *recorded* observation, or + # 0 if no read succeeded. Inserts whose follow-up read failed are not + # counted, so it can undercount the rows actually INSERTed and is not + # an upper bound on the final count. The monotonicity assertion only + # cares that final >= every earlier observation. 
+ + deadline = time.monotonic() + FINAL_READ_TIMEOUT_S + final: int | None = _fresh_select_count(prefix) + while final is None and time.monotonic() < deadline: + time.sleep(FINAL_READ_POLL_S) + final = _fresh_select_count(prefix) + + sometimes( + final is not None and final == expected_final, + "strict-serializable reads: final fresh-connection read reached inserted count", + { + "prefix": prefix, + "expected_final": expected_final, + "final_observed": final, + "observations": len(observations), + }, + ) + + # ----- monotonicity: adjacent-pair assertion ----- + # Across the recorded fresh-connection reads, no read may regress. + # This is the strict-serializable read-ordering property. + for i in range(1, len(observations)): + prev_step, prev_count = observations[i - 1] + curr_step, curr_count = observations[i] + always( + curr_count >= prev_count, + "strict-serializable reads: fresh-connection read regressed across adjacent observations", + { + "prefix": prefix, + "prev_step": prev_step, + "prev_count": prev_count, + "curr_step": curr_step, + "curr_count": curr_count, + }, + ) + + # ----- monotonicity: closing observation dominates the maximum ----- + # If the closing observation succeeded, it must be >= every earlier + # observation. (The final equality with `expected_final` is covered by + # the `sometimes` liveness anchor above and is not asserted here.) 
+ if final is not None and observations: + max_observed = max(c for _, c in observations) + always( + final >= max_observed, + "strict-serializable reads: closing fresh-connection read regressed below earlier maximum", + { + "prefix": prefix, + "final": final, + "max_earlier": max_observed, + }, + ) + + LOG.info( + "strict-serializable driver done; observations=%d final=%s expected_final=%s", + len(observations), + final, + expected_final, + ) + return 0 + + +if __name__ == "__main__": + # Touch the imported env constants so static analysis treats them as + # used; helper_pg re-exports them for drivers that bypass its retry + # helpers (as this one does for fresh connections). + _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os) + sys.exit(main()) From 06d90fbc6c53e0098671f44e8f244c3db773b962 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 00:40:20 -0400 Subject: [PATCH 27/65] =?UTF-8?q?test/antithesis:=20catalog=20cluster=20?= =?UTF-8?q?=E2=80=94=20partial=20epoch-fencing=20SUT=20anchor=20+=20catalo?= =?UTF-8?q?g-recovery-consistency=20workload=20driver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 1 + src/catalog/Cargo.toml | 1 + src/catalog/src/durable/persist.rs | 48 ++++ .../scratchbook/property-catalog.md | 2 + ...ton_driver_catalog_recovery_consistency.py | 240 ++++++++++++++++++ 5 files changed, 292 insertions(+) create mode 100755 test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py diff --git a/Cargo.lock b/Cargo.lock index 78cfc5d7ecd59..10cec38aca6fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5815,6 +5815,7 @@ dependencies = [ name = "mz-catalog" version = "0.0.0" dependencies = [ + "antithesis_sdk", "anyhow", "async-trait", "base64 0.22.1", diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 6704bd79d8b06..3553217de30ed 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -10,6 +10,7 @@ publish = false workspace = true 
[dependencies] +antithesis_sdk.workspace = true anyhow.workspace = true async-trait.workspace = true base64.workspace = true diff --git a/src/catalog/src/durable/persist.rs b/src/catalog/src/durable/persist.rs index c93830e38d7e3..83d560c98004c 100644 --- a/src/catalog/src/durable/persist.rs +++ b/src/catalog/src/durable/persist.rs @@ -17,6 +17,7 @@ use std::str::FromStr; use std::sync::{Arc, LazyLock}; use std::time::{Duration, Instant}; +use antithesis_sdk::assert_always_greater_than; use async_trait::async_trait; use differential_dataflow::lattice::Lattice; use futures::{FutureExt, StreamExt}; @@ -41,6 +42,7 @@ use mz_repr::Diff; use mz_storage_client::controller::PersistEpoch; use mz_storage_types::StorageDiff; use mz_storage_types::sources::SourceData; +use serde_json::json; use sha2::Digest; use timely::progress::{Antichain, Timestamp as TimelyTimestamp}; use tracing::{debug, info, warn}; @@ -145,6 +147,21 @@ impl FenceableToken { current_token, fence_token, } => { + // The two `assert!` calls below are the natural placement + // for an Antithesis `assert_always!` covering the + // FenceableToken state-machine invariant. They are not + // wrapped today because Materialize does not run multiple + // concurrent environmentd processes against the same + // catalog shard, so the `Fenced` state is unreachable in + // every supported topology — including the Antithesis + // topology in this repo. Wrapping them would create + // assertions Antithesis cannot exercise, which is dead + // weight in coverage reports. If we ever ship multi- + // environmentd (e.g. for a 0DT-preflight Antithesis run), + // convert these to `assert_always!` with distinct + // messages so a violation becomes a reportable property + // failure rather than a panic. See the + // `epoch-fencing-prevents-split-brain` catalog entry. 
assert!( fence_token > current_token, "must be fenced by higher token; current={current_token:?}, fence={fence_token:?}" @@ -1182,12 +1199,43 @@ impl UnopenedPersistCatalogState { "fencing previous catalogs" ); if matches!(self.mode, Mode::Writable) { + // Snapshot the prior durable epoch so the post-CaS anchor + // below can verify monotonicity. Captured before the write + // because `compare_and_append` may call `sync()` which + // reads new state into `self.fenceable_token`. + let prior_durable_epoch = self + .fenceable_token + .token() + .map(|t| t.epoch.get()) + .unwrap_or(0); match self .compare_and_append(fence_updates.clone(), commit_ts) .await { Ok(upper) => { commit_ts = upper; + // Antithesis anchor for `epoch-fencing-prevents- + // split-brain`: after our fence-token CaS commits, + // the freshly-minted epoch we just persisted must + // be strictly greater than the prior durable + // epoch. A regression here would mean a future + // lower-epoch writer would not be fenced out by + // the write we just made, opening the split-brain + // window the catalog is supposed to close. + let new_epoch = current_fenceable_token + .token() + .expect("freshly minted Unfenced token always has a current_token") + .epoch + .get(); + assert_always_greater_than!( + new_epoch, + prior_durable_epoch, + "catalog fencing: new durable epoch did not strictly increase after fence-token CaS", + &json!({ + "prior_durable_epoch": prior_durable_epoch, + "new_epoch": new_epoch, + }) + ); } Err(CompareAndAppendError::Fence(e)) => return Err(e.into()), Err(e @ CompareAndAppendError::UpperMismatch { .. 
}) => { diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 746796228d668..93c1380929881 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -15,6 +15,7 @@ Properties that verify data correctness when crashes, network partitions, and co |---|---| | **Type** | Safety | | **Priority** | P0 — fundamental split-brain prevention; failure here corrupts all state | +| **Status** | **Partially implemented (SUT-side, single-coordinator scope)** — `src/catalog/src/durable/persist.rs`: an `assert_always_greater_than!(new_epoch, prior_durable_epoch, "catalog fencing: new durable epoch did not strictly increase after fence-token CaS", …)` fires after each successful fence-token CaS in `open_inner`. Every environmentd restart in the Antithesis topology exercises this path. **The cross-coordinator half of the property (a `Fenced` writer being correctly rejected at validate time) is NOT exercised today and is not planned.** Materialize does not run multiple concurrent environmentd processes against the same catalog shard in any supported topology, so the `FenceableToken::Fenced` state is unreachable here. The two `assert!` panics in `FenceableToken::validate` would be the natural Antithesis anchor for that half; they are intentionally left as bare panics with an in-source comment pointing back to this entry, to be promoted to `assert_always!` if a 0DT-preflight-style multi-environmentd topology is ever added. | | **Property** | After a coordinator restart with a higher epoch, the old coordinator (lower epoch) cannot successfully write to the catalog persist shard. | | **Invariant** | `Always`: once a higher epoch is written to consensus, any compare_and_append from a lower epoch must fail with FenceError. This is a strict safety invariant — every check must hold. 
| | **Antithesis Angle** | Network partition separates old coordinator from consensus while new coordinator starts with higher epoch. When partition heals, old coordinator's in-flight writes must be rejected. Antithesis explores the timing window between old coordinator's last successful write and new coordinator's first write. | @@ -87,6 +88,7 @@ Properties that verify Materialize's strict serializability guarantee and timest |---|---| | **Type** | Safety | | **Priority** | P1 — catalog corruption on recovery prevents system from starting | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py`. Long-running singleton driver holds an in-process `expected_tables` set across cycles. Each cycle runs one CREATE TABLE or DROP TABLE under `execute_retry`, then opens a *fresh* psycopg connection and SELECTs `mz_tables` filtered to the driver's namespace, asserting `always("catalog recovery: live catalog table set matches in-process expected model", …)`. Cross-cycle stability is exactly the recovery check: if an environmentd restart lands between cycles, the next cycle's read is the post-recovery snapshot. Two `sometimes(...)` anchors record (a) "2+ assertion cycles ran" so the post-restart half is exercised, and (b) "observed environmentd connect failure during run" as a corroborating signal that a fault actually landed. The SUT-side upper-non-regression mirror in `sync_to_current_upper` and the consolidation `assert_always!` are deferred. | | **Property** | After coordinator crash and restart, the recovered catalog state is equivalent to the pre-crash state: upper never decreases, snapshot is consolidated, and all committed transactions are visible. | | **Invariant** | `Always`: upper(post_restart) >= upper(pre_crash). After sync_to_current_upper(), the snapshot contains no unconsolidated entries (all diffs resolved). 
| | **Antithesis Angle** | Crash coordinator during catalog_transact (after some updates persist but before upper advances). Crash during consolidation. Antithesis explores the timing of crashes within the catalog write path. | diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py new file mode 100755 index 0000000000000..59385a59a7ac7 --- /dev/null +++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `catalog-recovery-consistency`. + +After environmentd crashes and restarts, the catalog state must be +consistent with what was committed pre-crash: every previously-acknowledged +DDL operation must remain visible, and the catalog upper must not regress. +The user-visible form of this property is: "if I created a table and +received an OK, the table is still there after a restart." + +Approach mirrors `singleton_driver_upsert_state_rehydration.py`: + - One `singleton_driver_` per timeline, long enough to span multiple + Antithesis-injected environmentd restarts. + - In-process `expected_tables: set[str]` model holds the authoritative + "what should be in the catalog right now" view. + - Per cycle, do some DDL (CREATE TABLE or DROP TABLE), then open a + *fresh* psycopg connection and SELECT from `mz_tables` scoped to the + driver's namespace, asserting the live catalog matches `expected`. 
+ - Cross-cycle stability is the recovery check: if an environmentd + restart lands between cycle N and cycle N+1, cycle N+1's read is the + post-recovery snapshot and the assertion catches any lost or stuck + DDL. + +`helper_pg.execute_retry` retries OperationalError transparently, so when +environmentd is down mid-DDL the call will block-and-retry until the next +incarnation is reachable. That's exactly the timing we want: the DDL +either committed pre-crash (in which case it must reappear post-recovery) +or never committed (in which case the local model is updated only once a +retry is acknowledged). When the retry budget elapses before recovery, we abandon +that cycle's DDL without updating the local model — fault windows +exceeding the budget are *not* property failures. + +Two corroborating `sometimes(...)` anchors record (a) whether the driver +observed a coord-side connect failure during its run, and (b) whether at +least two assertion-bearing cycles ran (so the assertion at cycle N+1 +genuinely reads post-restart state, not just the same state as N). +""" + +from __future__ import annotations + +import logging +import os +import sys +import time + +import helper_random +import psycopg +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGUSER, + execute_retry, + query_retry, +) + +from antithesis.assertions import always, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.catalog_recovery_consistency") + +# Long-running knobs: the driver owns its timeline and the per-cycle budget +# has to comfortably exceed environmentd's restart time so a fault landing +# mid-DDL still resolves before the next cycle. CYCLE_COUNT is high enough to +# give Antithesis multiple windows to land a restart between cycles. 
+CYCLE_COUNT = 10 +DROP_PROBABILITY = 0.20 +INTER_CYCLE_SLEEP_S = 2.0 + +PROBE_CONNECT_TIMEOUT_S = 2.0 + + +def _fresh_observed_tables(name_prefix: str) -> set[str] | None: + """Open a new connection and SELECT mz_tables filtered to `name_prefix`. + + Returns the set of observed table names on success, or `None` on any + connect/query failure. None lets the caller skip the cycle's assertion + rather than blaming the property for a fault-window read. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as conn, conn.cursor() as cur: + cur.execute( + "SELECT name FROM mz_tables WHERE name LIKE %s", + (f"{name_prefix}%",), + ) + return {row[0] for row in cur.fetchall()} + except Exception: # noqa: BLE001 + return None + + +def _saw_coord_unavailable() -> bool: + """Best-effort one-shot probe with the same short connect timeout as + the assertion reads. A failure here means a coord-side connection was + refused or timed out within the last ~tick — a strong proxy for + "environmentd is down or just restarted." This is corroborating signal + only; it does not gate the safety assertion. + """ + try: + with psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as _conn: + pass + return False + except Exception: # noqa: BLE001 + return True + + +def _run_cycle( + expected: set[str], + name_prefix: str, + cycle_idx: int, + next_id: int, +) -> tuple[bool, int]: + """One create-or-drop + verify cycle. + + Returns (assertions_ran, next_id_after) where `assertions_ran` is True + iff this cycle landed a successful post-DDL read against a fresh + connection (i.e. the cycle contributes to the safety property). The + `next_id` counter is monotonic across cycles so table names are unique + even after drops. 
+ + The DDL is run via `execute_retry`, which already retries transient + OperationalError until the retry budget. If it raises anyway the + cycle aborts and the local model is not updated — exactly the + semantics needed: a DDL we never acknowledged is allowed to be + missing from the post-recovery catalog. + """ + new_id = next_id + if expected and helper_random.random_bool(DROP_PROBABILITY): + # Drop a random existing table. Choosing from `expected` keeps the + # drop deterministic w.r.t. the local model. + table = sorted(expected)[helper_random.random_int(0, len(expected) - 1)] + try: + execute_retry(f"DROP TABLE {table}") + except Exception as exc: # noqa: BLE001 + LOG.info("cycle %d: DROP %s failed (%s); not updating model", cycle_idx, table, exc) + return False, new_id + expected.discard(table) + else: + table = f"{name_prefix}_t{new_id:06d}" + try: + execute_retry(f"CREATE TABLE {table} (id BIGINT NOT NULL)") + except Exception as exc: # noqa: BLE001 + LOG.info("cycle %d: CREATE %s failed (%s); not updating model", cycle_idx, table, exc) + return False, new_id + expected.add(table) + new_id += 1 + + # Verify via a fresh connection. If this read fails, we skip the + # assertion — a fault-window read is not regression evidence. + observed = _fresh_observed_tables(name_prefix) + if observed is None: + LOG.info("cycle %d: fresh-connection read failed; skipping assertion", cycle_idx) + return False, new_id + + always( + observed == expected, + "catalog recovery: live catalog table set matches in-process expected model", + { + "cycle": cycle_idx, + "name_prefix": name_prefix, + "expected_count": len(expected), + "observed_count": len(observed), + # Cap the explicit diffs so the assertion details stay compact + # even on a large divergence. 
+ "missing_from_catalog": sorted(expected - observed)[:5], + "unexpected_in_catalog": sorted(observed - expected)[:5], + }, + ) + return True, new_id + + +def main() -> int: + # Per-timeline namespace so concurrent timelines and any future + # parallel_driver_ instances do not collide on table names. + name_prefix = f"catrec_{helper_random.random_u64():016x}" + LOG.info("catalog recovery driver starting; name_prefix=%s", name_prefix) + + expected: set[str] = set() + next_id = 0 + cycles_ran = 0 + saw_coord_unavailable = False + + for cycle_idx in range(CYCLE_COUNT): + ran, next_id = _run_cycle(expected, name_prefix, cycle_idx, next_id) + if ran: + cycles_ran += 1 + if _saw_coord_unavailable(): + saw_coord_unavailable = True + time.sleep(INTER_CYCLE_SLEEP_S) + + sometimes( + cycles_ran >= 2, + "catalog recovery: 2+ assertion-bearing cycles ran in this timeline", + {"cycles_ran": cycles_ran, "cycles_planned": CYCLE_COUNT}, + ) + sometimes( + saw_coord_unavailable, + "catalog recovery: observed environmentd connect failure during run", + {"cycles_ran": cycles_ran, "saw_coord_unavailable": saw_coord_unavailable}, + ) + + LOG.info( + "catalog recovery driver done; cycles_ran=%d/%d expected_size=%d saw_coord_unavailable=%s", + cycles_ran, + CYCLE_COUNT, + len(expected), + saw_coord_unavailable, + ) + return 0 + + +if __name__ == "__main__": + # Touch helper_pg env constants so static analysis treats them as + # used; the helper module re-exports them for drivers (like this one) + # that open their own connections. 
+ _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os, query_retry) + sys.exit(main()) From 3b9bac51e01d8481f1dcc115f1a96da264d24eed Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 00:54:58 -0400 Subject: [PATCH 28/65] test/antithesis: drop unfireable rehydration anchor; bump pg client timeouts; remove dead upsert.rs (classic) antithesis asserts --- src/storage/src/upsert.rs | 15 ------ .../scratchbook/property-catalog.md | 4 +- test/antithesis/workload/test/helper_pg.py | 12 ++++- ...ngleton_driver_upsert_state_rehydration.py | 53 +++++++------------ 4 files changed, 30 insertions(+), 54 deletions(-) diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs index 5c8922de4c022..cdc583d76b119 100644 --- a/src/storage/src/upsert.rs +++ b/src/storage/src/upsert.rs @@ -15,7 +15,6 @@ use std::hash::{Hash, Hasher}; use std::path::PathBuf; use std::sync::Arc; -use antithesis_sdk::{assert_always, assert_unreachable}; use differential_dataflow::hashable::Hashable; use differential_dataflow::{AsCollection, VecCollection}; use futures::StreamExt; @@ -35,7 +34,6 @@ use mz_timely_util::builder_async::{ PressOnDropButton, }; use serde::{Deserialize, Serialize}; -use serde_json::json; use sha2::{Digest, Sha256}; use timely::dataflow::channels::pact::Exchange; use timely::dataflow::operators::{Capability, InputCapability, Operator}; @@ -540,11 +538,6 @@ fn stage_input( } stash.extend(data.drain(..).map(|((key, value, order), time, diff)| { - assert_always!( - diff.is_positive(), - "upsert: input diff positive (classic)", - &json!({"diff": diff.into_inner()}) - ); assert!(diff.is_positive(), "invalid upsert input"); (time, key, Reverse(order), value) })); @@ -640,10 +633,6 @@ async fn drain_staged_input( let mut command_state = if let Entry::Occupied(command_state) = commands_state.entry(key) { command_state } else { - assert_unreachable!( - "upsert: key missing from commands_state (classic)", - &json!({"source_id": source_config.id.to_string()}) - ); 
panic!("key missing from commands_state"); }; @@ -1039,9 +1028,5 @@ async fn process_upsert_state_error( let update = HealthStatusUpdate::halting(e.context(context).to_string_with_causes(), None); health_output.give(health_cap, (None, update)); std::future::pending::<()>().await; - assert_unreachable!( - "upsert: pending future returned (classic)", - &json!({"site": "process_upsert_state_error"}) - ); unreachable!("pending future never returns"); } diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 93c1380929881..b09ceb0a00e7c 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -339,8 +339,8 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Reachability (Unreachable) | | **Priority** | P1 — these panics are explicit "should-never-happen" guards that bug history has hit | -| **Status** | **Implemented (SUT-side)** — every targeted site in `src/storage/src/upsert.rs` (stash diff-positive, `commands_state` missing key, `process_upsert_state_error` pending-future guard), `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion) gets a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`. Panics still terminate the process; Antithesis now also receives a reportable property failure with rich details. | -| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. 
Specifically: `assert!(diff.is_positive(), "invalid upsert input")` (upsert.rs:541, upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert.rs:636, upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). | +| **Status** | **Implemented (SUT-side, reachable sites only)** — every targeted *reachable* site has a uniquely-messaged `assert_always!`/`assert_unreachable!` paired with the original `panic!`/`assert!`: `upsert_continual_feedback.rs` (stash diff-positive, `commands_state` missing key), `upsert_continual_feedback_v2.rs` (input diff-positive, `(None, None)` join), and `upsert/types.rs` (`merge_update_state` non-Consolidating, double snapshot completion). The mirror sites in `src/storage/src/upsert.rs` (classic) were dropped: `upsert_operator` hard-codes `use_continual_feedback_upsert = true` (commit a63d1763e5, Feb 2025), so the classic-upsert code is provably unreachable in supported configurations and Antithesis-instrumenting it added dead-weight assertions. Panics still terminate the process; Antithesis receives a reportable property failure with rich details for every reachable site. | +| **Property** | The explicit panics and `assert!`s in the upsert operator never fire under any Antithesis-injected fault sequence. Specifically (reachable sites): `assert!(diff.is_positive(), "invalid upsert input")` (upsert_continual_feedback.rs:626, v2:315); `panic!("key missing from commands_state")` (upsert_continual_feedback.rs:800); `unreachable!()` for `(None, None)` in continual-feedback v2 (v2:483); the order-key panic that used to live in `drain_staged_input` (now a skip; commit f177db8286). 
| | **Invariant** | `Unreachable`: each of these sites is converted to an Antithesis `assert_unreachable!("…")` (or `assert_always!(false, …)`) so that any firing produces an explicit Antithesis property failure rather than a process crash. Distinct, unique message per site. | | **Antithesis Angle** | These are the high-signal SUT-side anchors. They catch the same family of bugs that historically reached production: order-key regression, missing dedup entry, retraction-on-input. Adding them costs almost nothing in the SUT and gives Antithesis precise replay anchors. | | **Why It Matters** | These panics indicate the operator entered an internal state its author thought was impossible. Past bugs (commits f177db8286, 1accbe28b3) reached production exactly through these paths. The asserts already exist; we just need to wrap them with the Antithesis SDK so the failures become reportable properties rather than process kills. | diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py index d336905b1914b..5c74276fe5f90 100644 --- a/test/antithesis/workload/test/helper_pg.py +++ b/test/antithesis/workload/test/helper_pg.py @@ -37,8 +37,16 @@ # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds # keep drivers progressing without masking real correctness signals. -_CONNECT_TIMEOUT_S = 5 -_RETRY_BUDGET_S = 60 +# +# These need to absorb a full Antithesis quiet period plus restart time for the +# system to come back. Quiet-period requests in the workload are typically +# 20-25s; the container then takes a few seconds to become responsive, so the +# overall budget must comfortably exceed ~30s. The per-attempt connect timeout +# also has to be long enough to actually complete a TCP+TLS handshake against +# a hung but recovering materialized — too short and every attempt fails fast +# and the budget is burned without giving the system a chance to answer. 
+_CONNECT_TIMEOUT_S = 15 +_RETRY_BUDGET_S = 120 _RETRY_INITIAL_S = 0.1 _RETRY_MAX_S = 2.0 diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index 5c41c406f3210..5f3c13bcdce57 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -34,10 +34,19 @@ The driver also records one `sometimes` anchor confirming that at least two assertion-bearing cycles ran (without this, the safety check could be -vacuously satisfied by a single early settle), and a second anchor -confirming clusterd was observed unavailable between cycles (best-effort -proxy for "restart happened" — the helper_pg retry budget makes connect -errors very rare under normal operation). +vacuously satisfied by a single early settle). + +A previous version of this driver also recorded a "clusterd observed +non-online" `sometimes` anchor via a once-per-cycle SELECT of +`mz_internal.mz_cluster_replica_statuses`. That assertion was structurally +unable to fire here: each cycle requests a 25-second Antithesis quiet +period before its assertions, the probe runs *after* the quiet period +(when faults are paused and killed containers have been restored), and +the introspection view itself lags clusterd death by the +orchestrator-process 5-second poll. The "did we see a replica go +offline" signal lives in `anytime_fault_recovery_exercised.py` instead, +which polls continuously and never requests a quiet period, so it has +the right shape to observe the offline window. Distinct prefix per timeline keeps multiple parallel timelines independent. 
""" @@ -102,29 +111,6 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]: return True, value -def _saw_clusterd_unavailable() -> bool: - """Best-effort probe: does `mz_internal.mz_cluster_replica_statuses` show - any `antithesis_cluster` replica with `status != 'online'` right now? - The status column reports `online` or `offline`. Catching `offline` - in a snapshot doesn't *prove* a restart happened (we may have missed - a transient flap entirely), but it's a noisy yes-signal that something - disturbed the cluster during the cycle. - """ - try: - row = query_one_retry(""" - SELECT EXISTS ( - SELECT 1 - FROM mz_internal.mz_cluster_replica_statuses s - JOIN mz_cluster_replicas r ON r.id = s.replica_id - JOIN mz_clusters c ON c.id = r.cluster_id - WHERE c.name = 'antithesis_cluster' AND s.status != 'online' - ) - """) - except Exception: # noqa: BLE001 - return False - return bool(row and row[0]) - - def _run_cycle( producer, tracker, expected: dict[str, str | None], cycle_idx: int ) -> bool: @@ -220,25 +206,22 @@ def main() -> int: expected: dict[str, str | None] = {} cycles_run = 0 - saw_replica_unavailable = False for cycle_idx in range(CYCLE_COUNT): if _run_cycle(producer, tracker, expected, cycle_idx): cycles_run += 1 - if _saw_clusterd_unavailable(): - saw_replica_unavailable = True time.sleep(INTER_CYCLE_SLEEP_S) + # The "did this run actually span a clusterd restart" anchor is + # deliberately not in this driver — see the module docstring. The + # `cycles_run >= 2` check below is the rehydration-coverage anchor: + # without two post-quiet-period reads, the safety assertions could + # be vacuously satisfied by a single early settle. 
sometimes( cycles_run >= 2, "upsert: rehydration driver ran 2+ assertion cycles", {"cycles_run": cycles_run, "cycles_planned": CYCLE_COUNT}, ) - sometimes( - saw_replica_unavailable, - "upsert: rehydration driver observed clusterd replica non-online", - {"cycles_run": cycles_run}, - ) LOG.info("rehydration driver done; %d/%d cycles ran", cycles_run, CYCLE_COUNT) return 0 From 4366c9e74eef6a1609ccad00fa0de879635d380d Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 01:06:24 -0400 Subject: [PATCH 29/65] test/antithesis: add second clusterd replica to antithesis_cluster for multi-replica fault coverage --- test/antithesis/config/docker-compose.yaml | 41 +++++++++++++++++++ test/antithesis/mzcompose.py | 16 +++++++- .../workload/workload-entrypoint.sh | 39 ++++++++++++------ 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 73291200a043c..05d5e1cc27297 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -210,6 +210,45 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + clusterd2: + entrypoint: + - tini + - -- + command: + - clusterd + - --scratch-directory=/scratch + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd2 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2102"], + 
"arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} materialized: hostname: materialized depends_on: @@ -404,6 +443,8 @@ services: condition: service_healthy clusterd1: condition: service_started + clusterd2: + condition: service_started kafka: condition: service_healthy schema-registry: diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index 552dd1d21e824..d4b31841da46f 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -15,8 +15,11 @@ - minio : S3-compatible blob storage for persist - zookeeper + kafka : Kafka broker for source ingestion - schema-registry : Avro/Protobuf schemas for kafka sources - - clusterd1 : external compute+storage process — fenceable - independently of materialized for fault testing + - clusterd1, clusterd2 : two external compute+storage processes — each + backs one replica of `antithesis_cluster`, so + Antithesis killing either container exercises the + compute/storage-replica recovery and rebalancing + paths without taking the cluster offline. 
- materialized : the SUT (environmentd; clusterd is external) - workload : Python test driver wired to the Antithesis SDK @@ -45,6 +48,7 @@ def __init__(self) -> None: "depends_on": { "materialized": {"condition": "service_healthy"}, "clusterd1": {"condition": "service_started"}, + "clusterd2": {"condition": "service_started"}, "kafka": {"condition": "service_healthy"}, "schema-registry": {"condition": "service_started"}, }, @@ -71,7 +75,14 @@ def __init__(self) -> None: Zookeeper(), Kafka(auto_create_topics=True), SchemaRegistry(), + # Two clusterd processes, one per replica of the unmanaged + # `antithesis_cluster`. Provisioning both replicas in the same cluster + # exercises multi-replica source ingestion and compute paths + # (notably the `compute-replica-epoch-isolation` property), and lets + # Antithesis kill either replica's backing container without taking + # the workload offline. Clusterd(name="clusterd1"), + Clusterd(name="clusterd2"), Materialized( external_blob_store=True, external_metadata_store=True, @@ -99,6 +110,7 @@ def workflow_default(c: Composition) -> None: "kafka", "schema-registry", "clusterd1", + "clusterd2", ) c.up("materialized") c.up("workload") diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index e660a7904bb46..ce6e664a2c0de 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -25,24 +25,39 @@ until curl -sf http://materialized:6878/api/readyz > /dev/null 2>&1; do done echo "materialized is healthy." -# Provision an unmanaged cluster backed by the external clusterd1 process. -# This must run before setup-complete so Test Composer assertions can target -# the cluster from the start. Idempotent — `IF NOT EXISTS` is unsupported on -# `CREATE CLUSTER REPLICAS (...)`, so we query mz_clusters first. +# Provision an unmanaged cluster with one replica per external clusterd +# process. 
Multi-replica gives Antithesis the option to kill one +# clusterd at a time without taking the workload offline, and exercises +# the multi-replica compute/storage code paths (notably +# `compute-replica-epoch-isolation`). +# +# This must run before setup-complete so Test Composer assertions can +# target the cluster from the start. Idempotent — `IF NOT EXISTS` is +# unsupported on `CREATE CLUSTER REPLICAS (...)`, so we query +# mz_clusters first. existing=$( psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \ "SELECT 1 FROM mz_clusters WHERE name = '$CLUSTER'" ) if [[ -z "$existing" ]]; then - echo "Provisioning cluster '$CLUSTER' against clusterd1..." + echo "Provisioning cluster '$CLUSTER' with replicas on clusterd1 + clusterd2..." psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" < Date: Tue, 12 May 2026 11:30:10 -0400 Subject: [PATCH 30/65] test/antithesis: per-clusterd scratch volume so two replicas don't share RocksDB lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I added clusterd2 in 4366c9e7, both clusterds inherited the DEFAULT_MZ_VOLUMES list, which uses a single named volume scratch:/scratch. Docker named volumes are shared across containers by name, so the two clusterds mounted the same /scratch and contended for RocksDB locks at paths like /scratch/storage/upsert/u3/0/LOCK. This wedged clusterd1: it could never open its upsert RocksDB ("Resource temporarily unavailable" on the LOCK file), entered Stalled health with "Failed to rehydrate state", broadcast suspend-and-restart, and looped retry-fail-suspend-restart for the entire run. The continuous restart loop drove the upsert feedback-driven snapshot replay path in ways that produced visibly wrong durable state for the source — exactly what the upsert-state-rehydrates-correctly assertions caught in the 2026-05-12 05:39 UTC Antithesis report. Fix: give each clusterd its own per-instance named volume for /scratch.
The other volumes stay shared because they don't take exclusive locks. Also patch export-compose.py to auto-declare any service-referenced named volume at the top level — Composition only auto-declares DEFAULT_MZ_VOLUMES, so without this the custom names broke `docker compose config`. --- test/antithesis/config/docker-compose.yaml | 6 +++-- test/antithesis/export-compose.py | 27 +++++++++++++++++++ test/antithesis/mzcompose.py | 31 ++++++++++++++++++++-- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 05d5e1cc27297..65fcc7f447fd9 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -205,7 +205,7 @@ services: - mzdata:/mzdata - mydata:/var/lib/mysql-files - tmp:/share/tmp - - scratch:/scratch + - clusterd1_scratch:/scratch restart: 'no' stop_grace_period: 120s platform: linux/amd64 @@ -244,7 +244,7 @@ services: - mzdata:/mzdata - mydata:/var/lib/mysql-files - tmp:/share/tmp - - scratch:/scratch + - clusterd2_scratch:/scratch restart: 'no' stop_grace_period: 120s platform: linux/amd64 @@ -475,3 +475,5 @@ volumes: tmp: null secrets: null scratch: null + clusterd1_scratch: null + clusterd2_scratch: null diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index 4e1fb5bece519..a204a76fdbf87 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -189,6 +189,31 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None: svc.pop(key, None) +def register_referenced_named_volumes(compose: dict[str, Any]) -> None: + """Declare any named volume referenced by a service that isn't already + declared at the top level. Docker Compose rejects the file otherwise. + + mzcompose's `Composition` only auto-declares the fixed `DEFAULT_MZ_VOLUMES` + set; per-service custom named volumes (e.g. 
`clusterd1_scratch`) reference + names that have no top-level entry and fail `docker compose config`. + """ + top_level: dict[str, Any] = compose.setdefault("volumes", {}) or {} + compose["volumes"] = top_level + + for svc in compose.get("services", {}).values(): + for entry in svc.get("volumes", []) or []: + if not isinstance(entry, str): + continue + # Bind mounts (`/host:/container`) start with `/`; named volumes + # are bare identifiers. We only auto-declare the latter. + if entry.startswith("/"): + continue + name = entry.split(":", 1)[0] + if not name or name in top_level: + continue + top_level[name] = None + + def main() -> None: # munge_services=False keeps ports bare (e.g., `6875` instead of # `127.0.0.1::6875`) — Antithesis is container-to-container, no host @@ -207,6 +232,8 @@ def main() -> None: strip_incompatible_env(svc) strip_mzcompose_keys(svc) + register_referenced_named_volumes(c.compose) + sys.stdout.write(HEADER) yaml.dump(c.compose, sys.stdout, default_flow_style=False, sort_keys=False) diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index d4b31841da46f..eb30696a43cf0 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -81,8 +81,35 @@ def __init__(self) -> None: # (notably the `compute-replica-epoch-isolation` property), and lets # Antithesis kill either replica's backing container without taking # the workload offline. - Clusterd(name="clusterd1"), - Clusterd(name="clusterd2"), + # + # Each clusterd MUST have its own /scratch volume — the upsert + # operator's RocksDB state lives there and takes an exclusive file + # lock. The DEFAULT_MZ_VOLUMES list uses a single named volume + # `scratch:/scratch` shared across containers; passing per-instance + # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the + # locks separate while leaving the other volumes shared. 
Found via + # an Antithesis run where clusterd1 deadlocked retrying to open + # `/scratch/storage/upsert/u3/0/LOCK` because clusterd2 held it, + # which then drove a continuous suspend-and-restart loop that + # corrupted the upsert state. + Clusterd( + name="clusterd1", + volumes=[ + "mzdata:/mzdata", + "mydata:/var/lib/mysql-files", + "tmp:/share/tmp", + "clusterd1_scratch:/scratch", + ], + ), + Clusterd( + name="clusterd2", + volumes=[ + "mzdata:/mzdata", + "mydata:/var/lib/mysql-files", + "tmp:/share/tmp", + "clusterd2_scratch:/scratch", + ], + ), Materialized( external_blob_store=True, external_metadata_store=True, From e98f3dc2bfd38f4c9367b6acfaae6ad98af4f155 Mon Sep 17 00:00:00 2001 From: Patrick Butler Date: Tue, 12 May 2026 14:13:58 -0400 Subject: [PATCH 31/65] test/antithesis: add workload for mysql multithreaded replication chain --- test/antithesis/config/.env | 24 +- test/antithesis/config/docker-compose.yaml | 75 ++++++ test/antithesis/mzcompose.py | 36 +++ .../properties/mysql-source-no-data-loss.md | 120 ++++++++++ .../scratchbook/property-catalog.md | 20 ++ test/antithesis/workload/Dockerfile | 3 +- .../test/first_mysql_replica_setup.py | 159 +++++++++++++ test/antithesis/workload/test/helper_mysql.py | 159 +++++++++++++ .../workload/test/helper_mysql_source.py | 97 ++++++++ .../test/parallel_driver_mysql_cdc.py | 223 ++++++++++++++++++ 10 files changed, 895 insertions(+), 21 deletions(-) create mode 100644 test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md create mode 100644 test/antithesis/workload/test/first_mysql_replica_setup.py create mode 100644 test/antithesis/workload/test/helper_mysql.py create mode 100644 test/antithesis/workload/test/helper_mysql_source.py create mode 100644 test/antithesis/workload/test/parallel_driver_mysql_cdc.py diff --git a/test/antithesis/config/.env b/test/antithesis/config/.env index d4f160a98596f..92cddafe58f23 100644 --- a/test/antithesis/config/.env +++ b/test/antithesis/config/.env @@ -1,21 
+1,5 @@ -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -# Compose env-file for `test/antithesis/config/docker-compose.yaml`. -# Tracked by git only so that the file exists for mzbuild's input -# fingerprinting and survives `git clean -ffdX` between builds. The -# committed values are placeholders — `build-antithesis.sh` overwrites -# them in CI with refs to images pushed to Antithesis's GCP Artifact -# Registry, and `make export-env` does the same with local-dev refs. -# -# If you see these placeholder values on a running cluster, your build -# pipeline did not regenerate this file. Run: +# GENERATED FILE — do not edit. Regenerate via: # bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env -MATERIALIZED_IMAGE=placeholder-not-built -ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built +# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time. 
+MATERIALIZED_IMAGE=ghcr.io/materializeinc/materialize/materialized:mzbuild-EMRA5ARAVQMKNFJIHZJTAPOEWMAGW5TX +ANTITHESIS_WORKLOAD_IMAGE=ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-YKN4ZHJT7YAPYQLQV5BYTUPCRY2RUDRI diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 65fcc7f447fd9..97d982367e63b 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -171,6 +171,71 @@ services: interval: 1s start_period: 120s platform: linux/amd64 + mysql: + init: true + ports: + - 3306 + environment: + - MYSQL_ROOT_PASSWORD=p@ssw0rd + command: + - --secure-file-priv=/var/lib/mysql-files + - --log-bin=mysql-bin + - --gtid_mode=ON + - --enforce_gtid_consistency=ON + - --binlog-format=row + - --binlog-row-image=full + - --binlog-row-metadata=full + - --server-id=1 + - --max-connections=500 + - --binlog_transaction_dependency_tracking=WRITESET + healthcheck: + test: + - CMD + - mysqladmin + - ping + - --password=p@ssw0rd + - --protocol=TCP + interval: 1s + start_period: 180s + volumes: + - mysqldata_primary:/var/lib/mysql + - mydata:/var/lib/mysql-files + image: mysql:9.5.0 + platform: linux/amd64 + mysql-replica: + init: true + ports: + - 3306 + environment: + - MYSQL_ROOT_PASSWORD=p@ssw0rd + command: + - --secure-file-priv=/var/lib/mysql-files + - --log-bin=mysql-bin + - --gtid_mode=ON + - --enforce_gtid_consistency=ON + - --binlog-format=row + - --binlog-row-image=full + - --binlog-row-metadata=full + - --server-id=2 + - --max-connections=500 + - --log-slave-updates + - --skip-replica-start + - --replica_parallel_workers=4 + - --replica_preserve_commit_order=ON + healthcheck: + test: + - CMD + - mysqladmin + - ping + - --password=p@ssw0rd + - --protocol=TCP + interval: 1s + start_period: 180s + volumes: + - mysqldata_replica:/var/lib/mysql + - mydata:/var/lib/mysql-files + image: mysql:9.5.0 + platform: linux/amd64 clusterd1: entrypoint: - tini @@ -419,6 +484,7 
@@ services: - MZ_NO_BUILTIN_POSTGRES=1 - MZ_NO_BUILTIN_COCKROACH=1 - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter + - MZ_LICENSE_KEY=/license_key/license_key volumes: - mzdata:/mzdata - mydata:/var/lib/mysql-files @@ -449,6 +515,10 @@ services: condition: service_healthy schema-registry: condition: service_started + mysql: + condition: service_healthy + mysql-replica: + condition: service_healthy environment: - PGHOST=materialized - PGPORT=6875 @@ -458,6 +528,9 @@ services: - KAFKA_BROKER=kafka:9092 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 - MZ_ANTITHESIS_CLUSTER=antithesis_cluster + - MYSQL_HOST=mysql + - MYSQL_REPLICA_HOST=mysql-replica + - MYSQL_PASSWORD=p@ssw0rd platform: linux/amd64 image: ${ANTITHESIS_WORKLOAD_IMAGE} networks: {} @@ -475,5 +548,7 @@ volumes: tmp: null secrets: null scratch: null + mysqldata_primary: null + mysqldata_replica: null clusterd1_scratch: null clusterd2_scratch: null diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index eb30696a43cf0..c799269cbd216 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -34,6 +34,7 @@ from materialize.mzcompose.services.kafka import Kafka from materialize.mzcompose.services.materialized import Materialized from materialize.mzcompose.services.minio import Minio +from materialize.mzcompose.services.mysql import MySql, create_mysql_server_args from materialize.mzcompose.services.postgres import PostgresMetadata from materialize.mzcompose.services.schema_registry import SchemaRegistry from materialize.mzcompose.services.zookeeper import Zookeeper @@ -51,6 +52,8 @@ def __init__(self) -> None: "clusterd2": {"condition": "service_started"}, "kafka": {"condition": "service_healthy"}, "schema-registry": {"condition": "service_started"}, + "mysql": {"condition": "service_healthy"}, + "mysql-replica": {"condition": "service_healthy"}, }, "environment": [ "PGHOST=materialized", @@ -64,6 +67,10 @@ def 
__init__(self) -> None: # Name of the unmanaged cluster the workload-entrypoint # provisions against clusterd1 before emitting setup-complete. "MZ_ANTITHESIS_CLUSTER=antithesis_cluster", + # MySQL primary and replica connection details. + "MYSQL_HOST=mysql", + "MYSQL_REPLICA_HOST=mysql-replica", + f"MYSQL_PASSWORD={MySql.DEFAULT_ROOT_PASSWORD}", ], } super().__init__(name="workload", config=config) @@ -75,6 +82,33 @@ def __init__(self) -> None: Zookeeper(), Kafka(auto_create_topics=True), SchemaRegistry(), + # MySQL primary — GTID-enabled with WRITESET dependency tracking so the + # replica can safely use parallel workers without losing commit order. + MySql( + use_seeded_image=False, + volumes=[ + "mysqldata_primary:/var/lib/mysql", + "mydata:/var/lib/mysql-files", + ], + additional_args=create_mysql_server_args(server_id="1", is_master=True) + + ["--binlog_transaction_dependency_tracking=WRITESET"], + ), + # MySQL replica — multithreaded replication (4 workers, commit-order + # preserved). Replication is configured at runtime by + # first_mysql_replica_setup.py after both containers are healthy. + MySql( + name="mysql-replica", + use_seeded_image=False, + volumes=[ + "mysqldata_replica:/var/lib/mysql", + "mydata:/var/lib/mysql-files", + ], + additional_args=create_mysql_server_args(server_id="2", is_master=False) + + [ + "--replica_parallel_workers=4", + "--replica_preserve_commit_order=ON", + ], + ), # Two clusterd processes, one per replica of the unmanaged # `antithesis_cluster`. 
Provisioning both replicas in the same cluster # exercises multi-replica source ingestion and compute paths @@ -138,6 +172,8 @@ def workflow_default(c: Composition) -> None: "schema-registry", "clusterd1", "clusterd2", + "mysql", + "mysql-replica", ) c.up("materialized") c.up("workload") diff --git a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md new file mode 100644 index 0000000000000..19f6d02d68974 --- /dev/null +++ b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md @@ -0,0 +1,120 @@ +# mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible in Materialize + +## Summary + +Every row inserted to the MySQL primary must eventually appear — with the +correct value — in the Materialize CDC source that reads from the +multithreaded MySQL replica. The pipeline is: + +``` +MySQL primary --GTID binlog--> MySQL replica (4 parallel workers) + | + Materialize CDC source + (antithesis_cluster) + | + antithesis_cdc table +``` + +## Instrumentation + +**Workload-side** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py`. + +Each `parallel_driver_` invocation: +1. Assigns a per-invocation `batch_id` prefix (Antithesis-seeded RNG). +2. Inserts `ROWS_PER_INVOCATION` (20) rows to `antithesis.cdc_test` on the + MySQL primary, recording the expected `{id → value}` map locally. +3. Requests an Antithesis quiet period (25 s) and polls `antithesis_cdc` in + Materialize until all expected rows appear or the 90 s budget expires. +4. Fires: + - `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` + — liveness anchor; confirms at least one invocation reaches full catchup. + - `always("mysql: CDC source row has correct value after catchup", …)` — safety; + fired once per row, catches wrong-value corruption. 
+ - `always("mysql: CDC source row count matches inserted count after catchup", …)` + — safety; catches extra phantom rows (count > expected) or missing rows + (count < expected) at the batch level. + +**First-run setup** — `test/antithesis/workload/test/first_mysql_replica_setup.py`. + +Runs once per Antithesis timeline before any parallel drivers start: +- Creates `antithesis.cdc_test` on the primary. +- Configures the replica channel (`CHANGE REPLICATION SOURCE TO … SOURCE_AUTO_POSITION=1`). +- Sets `replica_parallel_workers = 4`, `replica_preserve_commit_order = ON`. +- Starts the replica. +- Creates the Materialize connection (`antithesis_mysql_conn`), source + (`mysql_cdc_source`), and table (`antithesis_cdc`). +- Fires `reachable("mysql: first-run setup complete …")` so Antithesis can + confirm the setup path is exercised in every timeline. +- Fires `sometimes("mysql replica: antithesis.cdc_test replicated from primary within 90s", …)` + to confirm initial replication is flowing before the source is created. + +## Why This Property Matters + +MySQL CDC via a multithreaded replica is a distinct and failure-prone code +path compared to the Kafka/upsert path that the existing drivers exercise. +Key fault scenarios exposed: + +- **Replica lag under faults** — if Antithesis kills the MySQL replica + container, the replica restarts from its persisted GTID position (the + replica data volume is persistent). The Materialize source must reconnect + and resume without dropping rows. + +- **Parallel replication ordering** — with 4 parallel workers and + `replica_preserve_commit_order=ON`, the replica applies transactions + concurrently but in primary commit order. Antithesis can inject scheduling + jitter that stresses the ordering protocol. + +- **Primary kills** — if Antithesis kills the MySQL primary, the replica + loses its upstream. Materialize's CDC source must handle the replica going + silent gracefully (not panic, not report wrong data). 
+ +- **Materialize clusterd restarts** — the MySQL CDC source resumes from the + last committed GTID in the persist shard, similar to the Kafka source + resume-offset logic. Existing `storage-command-replay-idempotent` property + is stressed through the MySQL code path. + +## Assertion Types Chosen + +- `sometimes(…)` for liveness (catchup): the system must make progress at + least once per run. Under heavy fault injection catchup may not complete + every invocation; that's expected. We care that it succeeds at least once. + +- `always(…)` for safety (per-row value, batch count): once we've confirmed + catchup, every observable row must be correct. This is a hard safety + invariant. + +- `reachable(…)` for setup completion: ensures Antithesis counts the + first-run setup as an exercised path across the run. + +## Related Properties + +- `storage-command-replay-idempotent` — MySQL CDC resume on clusterd restart + exercises the same command-history replay path as Kafka sources. +- `fault-recovery-exercised` — the `sometimes(…)` recovery probe also fires + after MySQL-induced coordinator failures. +- `kafka-source-survives-clusterd-restart` — shares the "source resumes after + storage worker kill" structure; MySQL adds the replica-replication dimension. 
+ +## Schema + +```sql +-- MySQL (primary and replica via replication): +CREATE TABLE antithesis.cdc_test ( + id VARCHAR(64) NOT NULL PRIMARY KEY, + batch_id VARCHAR(64) NOT NULL, + value TEXT NOT NULL, + updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6) + ON UPDATE CURRENT_TIMESTAMP(6) +); + +-- Materialize: +CREATE SECRET antithesis_mysql_password AS '…'; +CREATE CONNECTION antithesis_mysql_conn TO MYSQL ( + HOST 'mysql-replica', USER 'root', + PASSWORD SECRET antithesis_mysql_password +); +CREATE SOURCE mysql_cdc_source IN CLUSTER antithesis_cluster + FROM MYSQL CONNECTION antithesis_mysql_conn; +CREATE TABLE antithesis_cdc + FROM SOURCE mysql_cdc_source (REFERENCE antithesis.cdc_test); +``` diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index b09ceb0a00e7c..2c308cf3e2e2b 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -2,6 +2,7 @@ commit: 007c7af9d9970fb2030c7212368b232e0fbc363e updated: 2026-05-12 --- + # Property Catalog: Materialize @@ -404,6 +405,25 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. | | **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. | +## Category 8: MySQL CDC Source + +Properties specific to Materialize's MySQL CDC source pipeline, which reads +from a multithreaded MySQL replica. 
The topology adds a MySQL primary (GTID + +WRITESET dependency tracking) and a MySQL replica (4 parallel workers, +commit-order preservation) to the Antithesis environment. + +### mysql-source-no-data-loss — Every Row Written to MySQL Primary Is Eventually Visible + +| | | +|---|---| +| **Type** | Liveness + Safety | +| **Priority** | P1 — end-to-end correctness of the MySQL CDC pipeline; tests a distinct code path from Kafka | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, waits for a quiet period, then polls `antithesis_cdc` until all rows appear (or 90 s budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. | +| **Property** | After inserting a row to the MySQL primary (via the binlog + GTID-based multithreaded replica), the Materialize CDC source eventually contains that row with the correct value. | +| **Invariant** | `Always`: after catchup, for every row inserted to `antithesis.cdc_test` on the primary, `SELECT value FROM antithesis_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the quiet-period budget at least once per run. 
| +| **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. | +| **Why It Matters** | MySQL CDC is a distinct ingestion code path from Kafka. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume — is not caught by the Kafka-source drivers. | + ### offset-known-not-below-committed — Source Statistics Causality | | | diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile index b72a6b541d818..5cca619ed8234 100644 --- a/test/antithesis/workload/Dockerfile +++ b/test/antithesis/workload/Dockerfile @@ -22,7 +22,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN pip install --no-cache-dir \ psycopg[binary]==3.2.9 \ confluent-kafka==2.8.0 \ - antithesis==0.2.0 + antithesis==0.2.0 \ + PyMySQL==1.1.1 # setup-complete script COPY setup-complete.sh /usr/local/bin/setup-complete.sh diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py new file mode 100644 index 0000000000000..4380b5f4bd40d --- /dev/null +++ b/test/antithesis/workload/test/first_mysql_replica_setup.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +"""Antithesis first_ command: configure MySQL multithreaded replica replication +and create the Materialize MySQL CDC source. + +Runs once per Antithesis timeline before any parallel/singleton drivers start. +Steps: + 1. Wait for both MySQL containers to accept connections. + 2. Create the `antithesis` database and `cdc_test` table on the primary. + 3. Configure the replica to replicate from the primary via GTID with 4 + parallel worker threads (multithreaded replication). + 4. Start the replica. + 5. Wait for `antithesis.cdc_test` to appear on the replica (confirms + replication is flowing). + 6. Create the Materialize connection, source, and table from the replica. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_mysql +from helper_mysql_source import ensure_mysql_cdc_source + +from antithesis.assertions import reachable, sometimes + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("first.mysql_replica_setup") + + +def setup_primary() -> None: + """Create the antithesis schema and cdc_test table on the MySQL primary.""" + LOG.info("creating antithesis database and cdc_test table on primary") + helper_mysql.execute_primary("CREATE DATABASE IF NOT EXISTS antithesis") + helper_mysql.execute_primary( + """ + CREATE TABLE IF NOT EXISTS antithesis.cdc_test ( + id VARCHAR(64) NOT NULL PRIMARY KEY, + batch_id VARCHAR(64) NOT NULL, + value TEXT NOT NULL, + updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6) + ON UPDATE CURRENT_TIMESTAMP(6) + ) + """, + database="antithesis", + ) + LOG.info("antithesis.cdc_test ready on primary") + + +def configure_replica() -> None: + """Configure the MySQL replica to replicate from the primary. + + Uses GTID auto-positioning with 4 parallel workers. The replica starts + with --skip-replica-start so we configure the channel before starting. + Idempotent: stops and resets any existing channel first. 
+ """ + LOG.info( + "configuring replica to replicate from %s with 4 parallel workers", + helper_mysql.MYSQL_HOST, + ) + # Stop and reset any existing channel (no-op on a fresh container). + try: + helper_mysql.execute_replica("STOP REPLICA") + except Exception: # noqa: BLE001 + pass + try: + helper_mysql.execute_replica("RESET REPLICA ALL") + except Exception: # noqa: BLE001 + pass + + helper_mysql.execute_replica( + f"CHANGE REPLICATION SOURCE TO " + f"SOURCE_HOST='{helper_mysql.MYSQL_HOST}', " + f"SOURCE_USER='root', " + f"SOURCE_PASSWORD='{helper_mysql.MYSQL_PASSWORD}', " + f"SOURCE_AUTO_POSITION=1, " + f"GET_SOURCE_PUBLIC_KEY=1" + ) + # Set parallel replication parameters before starting. + helper_mysql.execute_replica("SET GLOBAL replica_parallel_workers = 4") + helper_mysql.execute_replica("SET GLOBAL replica_preserve_commit_order = ON") + helper_mysql.execute_replica("START REPLICA") + LOG.info("MySQL replica started") + + +def wait_for_replica_table(timeout_s: float = 90.0) -> bool: + """Wait until antithesis.cdc_test is visible on the replica. + + Returns True when the table appears (replication is flowing), False on + timeout. 
+ """ + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + try: + rows = helper_mysql.query_replica( + "SELECT 1 FROM information_schema.tables " + "WHERE table_schema = 'antithesis' AND table_name = 'cdc_test'", + ) + if rows: + LOG.info("antithesis.cdc_test visible on replica — replication flowing") + return True + except Exception as exc: # noqa: BLE001 + LOG.info("waiting for replica table: %s", exc) + time.sleep(2) + LOG.warning("timed out waiting for antithesis.cdc_test on replica") + return False + + +def main() -> int: + LOG.info("waiting for MySQL primary (%s)...", helper_mysql.MYSQL_HOST) + helper_mysql.wait_for_primary() + + LOG.info("waiting for MySQL replica (%s)...", helper_mysql.MYSQL_REPLICA_HOST) + helper_mysql.wait_for_replica() + + setup_primary() + configure_replica() + + replica_ready = wait_for_replica_table() + sometimes( + replica_ready, + "mysql replica: antithesis.cdc_test replicated from primary within 90s", + { + "primary": helper_mysql.MYSQL_HOST, + "replica": helper_mysql.MYSQL_REPLICA_HOST, + }, + ) + if not replica_ready: + # Proceed anyway — replication may catch up before Materialize tries to + # validate the source, but log a warning so triage can correlate. + LOG.warning("replica table not yet visible; proceeding with source creation") + + ensure_mysql_cdc_source() + + reachable( + "mysql: first-run setup complete — replica configured, Materialize source created", + { + "primary": helper_mysql.MYSQL_HOST, + "replica": helper_mysql.MYSQL_REPLICA_HOST, + }, + ) + LOG.info("MySQL CDC setup complete") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py new file mode 100644 index 0000000000000..e99b3656cb4dd --- /dev/null +++ b/test/antithesis/workload/test/helper_mysql.py @@ -0,0 +1,159 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""MySQL connection helpers for Antithesis drivers. + +Connects to the MySQL primary and replica via PyMySQL. All calls retry +transient network and operational errors up to a fixed budget so the +workload keeps progressing under fault injection. +""" + +from __future__ import annotations + +import logging +import os +import time + +import pymysql +import pymysql.cursors + +LOG = logging.getLogger("antithesis.helper_mysql") + +MYSQL_HOST = os.environ.get("MYSQL_HOST", "mysql") +MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica") +MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306")) +MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd") + +_RETRY_BUDGET_S = 120 +_RETRY_INITIAL_S = 0.5 +_RETRY_MAX_S = 4.0 + + +def _retryable(exc: BaseException) -> bool: + return isinstance(exc, (pymysql.OperationalError, pymysql.InterfaceError)) + + +def _open(host: str, database: str) -> pymysql.connections.Connection: + """Open a single MySQL connection with retries on transient errors.""" + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + return pymysql.connect( + host=host, + port=MYSQL_PORT, + user="root", + password=MYSQL_PASSWORD, + database=database, + connect_timeout=15, + autocommit=True, + ) + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info( + "mysql connect to %s retrying after %s; backoff=%.2fs", + host, + exc, + backoff, + ) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def _execute(host: str, sql: str, params: tuple = (), database: str = "mysql") -> None: + deadline = time.monotonic() + 
_RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = _open(host, database) + with conn.cursor() as cur: + cur.execute(sql, params) + conn.close() + return + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("mysql execute on %s retrying after %s", host, exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def _query( + host: str, sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + deadline = time.monotonic() + _RETRY_BUDGET_S + backoff = _RETRY_INITIAL_S + while True: + try: + conn = _open(host, database) + with conn.cursor() as cur: + cur.execute(sql, params) + result = list(cur.fetchall()) + conn.close() + return result + except Exception as exc: # noqa: BLE001 + if not _retryable(exc) or time.monotonic() > deadline: + raise + LOG.info("mysql query on %s retrying after %s", host, exc) + time.sleep(backoff) + backoff = min(backoff * 2, _RETRY_MAX_S) + + +def execute_primary(sql: str, params: tuple = (), database: str = "mysql") -> None: + """Execute a statement on the MySQL primary.""" + _execute(MYSQL_HOST, sql, params, database) + + +def execute_replica(sql: str, params: tuple = (), database: str = "mysql") -> None: + """Execute a statement on the MySQL replica.""" + _execute(MYSQL_REPLICA_HOST, sql, params, database) + + +def query_primary( + sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + """Run a query on the MySQL primary and return all rows.""" + return _query(MYSQL_HOST, sql, params, database) + + +def query_replica( + sql: str, params: tuple = (), database: str = "mysql" +) -> list[tuple]: + """Run a query on the MySQL replica and return all rows.""" + return _query(MYSQL_REPLICA_HOST, sql, params, database) + + +def wait_for_host(host: str, timeout_s: float = 180.0) -> None: + """Block until MySQL on `host` accepts connections.""" + deadline = time.monotonic() + timeout_s + while time.monotonic() 
< deadline: + try: + conn = pymysql.connect( + host=host, + port=MYSQL_PORT, + user="root", + password=MYSQL_PASSWORD, + connect_timeout=5, + ) + conn.close() + LOG.info("mysql %s is ready", host) + return + except Exception as exc: # noqa: BLE001 + LOG.info("waiting for mysql %s: %s", host, exc) + time.sleep(2) + raise TimeoutError(f"MySQL at {host} not ready after {timeout_s}s") + + +def wait_for_primary(timeout_s: float = 180.0) -> None: + wait_for_host(MYSQL_HOST, timeout_s) + + +def wait_for_replica(timeout_s: float = 180.0) -> None: + wait_for_host(MYSQL_REPLICA_HOST, timeout_s) diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py new file mode 100644 index 0000000000000..6572eddc9c7e4 --- /dev/null +++ b/test/antithesis/workload/test/helper_mysql_source.py @@ -0,0 +1,97 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Idempotent setup for the Antithesis MySQL CDC source in Materialize. + +The MySQL CDC pipeline: + mysql (primary) --binlog--> mysql-replica --CDC--> Materialize + +Materialize reads from the replica so that faults to the replica exercise +the Materialize source recovery path independently of faults to the primary. 
+ +Objects created in Materialize: + - SECRET antithesis_mysql_password + - CONNECTION antithesis_mysql_conn -> mysql-replica + - SOURCE mysql_cdc_source (IN CLUSTER antithesis_cluster) + - TABLE antithesis_cdc (REFERENCE antithesis.cdc_test) +""" + +from __future__ import annotations + +import logging +import os + +import psycopg + +from helper_pg import create_source_idempotent, execute_retry, query_retry + +LOG = logging.getLogger("antithesis.helper_mysql_source") + +CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") +MYSQL_REPLICA_HOST = os.environ.get("MYSQL_REPLICA_HOST", "mysql-replica") +MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd") + +MYSQL_DATABASE = "antithesis" +MYSQL_TABLE = "cdc_test" + +SECRET_NAME = "antithesis_mysql_password" +CONNECTION_NAME = "antithesis_mysql_conn" +SOURCE_NAME = "mysql_cdc_source" +TABLE_NAME = "antithesis_cdc" + + +def ensure_mysql_connection() -> None: + """Create the MySQL secret and connection in Materialize (idempotent).""" + execute_retry( + f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'" + ) + execute_retry( + f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO MYSQL (" + f"HOST '{MYSQL_REPLICA_HOST}', " + f"USER 'root', " + f"PASSWORD SECRET {SECRET_NAME}" + f")" + ) + LOG.info("mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST) + + +def ensure_mysql_cdc_table() -> None: + """Create the Materialize table from the MySQL CDC source (idempotent).""" + try: + execute_retry( + f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} " + f"FROM SOURCE {SOURCE_NAME} " + f"(REFERENCE {MYSQL_DATABASE}.{MYSQL_TABLE})" + ) + except psycopg.errors.InternalError as exc: + if "already exists" not in str(exc): + raise + rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,)) + if rows: + LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME) + return + raise + LOG.info("mysql cdc table %s ready", TABLE_NAME) + + +def 
ensure_mysql_cdc_source() -> None: + """Create the full MySQL CDC pipeline in Materialize (idempotent). + + Requires antithesis.cdc_test to already exist on the MySQL replica. + Call first_mysql_replica_setup.py before this in any standalone use. + """ + ensure_mysql_connection() + create_source_idempotent( + f"CREATE SOURCE IF NOT EXISTS {SOURCE_NAME} " + f"IN CLUSTER {CLUSTER} " + f"FROM MYSQL CONNECTION {CONNECTION_NAME}", + SOURCE_NAME, + ) + LOG.info("mysql cdc source %s ready", SOURCE_NAME) + ensure_mysql_cdc_table() diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py new file mode 100644 index 0000000000000..67a9627e1e386 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for property `mysql-source-no-data-loss`. + +Every row inserted to the MySQL primary must eventually appear — with the +correct value — in the Materialize CDC source that reads from the +multithreaded replica. + +Each invocation: + 1. Checks the MySQL CDC source exists (created by first_mysql_replica_setup). + 2. Picks a per-invocation `batch_id` prefix so concurrent drivers don't + collide. + 3. Inserts ROWS_PER_INVOCATION rows to the MySQL primary, recording the + expected {id → value} map locally. + 4. Requests an Antithesis quiet period and polls the Materialize source + table until all expected rows appear (or the budget expires). + 5. Asserts correctness via `always(...)` on count and per-row values. 
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
)
LOG = logging.getLogger("driver.mysql_cdc")

# Rows attempted per invocation; also the hard upper bound on how many rows
# can ever exist for one batch_id (row ids are `<batch_id>:<i>`).
ROWS_PER_INVOCATION = 20
# Passed to request_quiet_period() after inserting (presumably seconds).
QUIET_PERIOD_S = 25
# Wall-clock budget, in seconds, for the source to show every confirmed row.
CATCHUP_TIMEOUT_S = 90.0
# Sleep between catch-up polls, in seconds.
POLL_INTERVAL_S = 1.0


def _source_exists() -> bool:
    """Return True when a source named SOURCE_NAME is listed in mz_sources."""
    rows = query_retry("SELECT 1 FROM mz_sources WHERE name = %s", (SOURCE_NAME,))
    return bool(rows)


def _count_batch_rows(batch_id: str) -> int:
    """Return how many rows Materialize currently shows for `batch_id`.

    An empty result set or a NULL count is reported as 0.
    """
    rows = query_retry(
        f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s",
        (batch_id,),
    )
    return int(rows[0][0]) if rows and rows[0][0] is not None else 0


def _insert_rows(batch_id: str) -> dict[str, str]:
    """Insert ROWS_PER_INVOCATION rows to the MySQL primary.

    Returns {id → value} for every row whose insert call returned without
    raising. A row missing from the result is *indeterminate*, not absent:
    the statement may still have committed on the primary even though the
    client saw an error.
    """
    expected: dict[str, str] = {}
    for i in range(ROWS_PER_INVOCATION):
        row_id = f"{batch_id}:{i}"
        value = f"v{helper_random.random_int(0, 9999):04d}"
        try:
            helper_mysql.execute_primary(
                "INSERT INTO antithesis.cdc_test (id, batch_id, value) "
                "VALUES (%s, %s, %s) "
                "ON DUPLICATE KEY UPDATE value = VALUES(value), batch_id = VALUES(batch_id)",
                (row_id, batch_id, value),
                database="antithesis",
            )
        except Exception as exc:  # noqa: BLE001
            # Under fault injection a write to the primary may fail. Skip the
            # row rather than crashing so the driver keeps inserting others.
            LOG.info("insert failed for row %s: %s; skipping", row_id, exc)
        else:
            expected[row_id] = value
    return expected


def _wait_for_catchup(batch_id: str, expected_count: int) -> bool:
    """Poll Materialize until all expected rows for `batch_id` appear.

    Returns True when `COUNT(*) WHERE batch_id = ?` reaches expected_count,
    False on timeout. Poll failures are logged and retried until the deadline.
    """
    deadline = time.monotonic() + CATCHUP_TIMEOUT_S
    last_seen = -1
    while time.monotonic() < deadline:
        try:
            count = _count_batch_rows(batch_id)
        except Exception as exc:  # noqa: BLE001
            LOG.info("catchup poll failed: %s; retrying", exc)
            time.sleep(POLL_INTERVAL_S)
            continue

        # Only log when the observed count moves, to keep the log readable.
        if count != last_seen:
            LOG.info(
                "mysql cdc catchup: batch=%s observed=%d target=%d",
                batch_id,
                count,
                expected_count,
            )
            last_seen = count

        if count >= expected_count:
            return True
        time.sleep(POLL_INTERVAL_S)

    LOG.warning(
        "mysql cdc catchup timeout: batch=%s last_seen=%d target=%d",
        batch_id,
        last_seen,
        expected_count,
    )
    return False


def _check_rows(expected: dict[str, str]) -> None:
    """Assert every expected row has the correct value in the Materialize source."""
    for row_id, want in expected.items():
        rows = query_retry(
            f"SELECT value FROM {TABLE_NAME} WHERE id = %s",
            (row_id,),
        )
        found = bool(rows)
        observed = rows[0][0] if found else None
        always(
            found and observed == want,
            "mysql: CDC source row has correct value after catchup",
            {
                "source": TABLE_NAME,
                "id": row_id,
                "expected_value": want,
                "observed_present": found,
                "observed_value": observed,
            },
        )


def main() -> int:
    """Run one driver invocation: insert a batch, wait for catch-up, assert.

    Always returns 0; findings are reported through the Antithesis
    `always`/`sometimes` assertions rather than through the exit code.
    """
    if not _source_exists():
        # first_mysql_replica_setup must run before this driver. Outside
        # Antithesis (e.g. snouty validate) the source may not exist yet —
        # exit cleanly rather than erroring so validate can still proceed.
        LOG.warning(
            "mysql cdc source %s not found; skipping "
            "(first_mysql_replica_setup must run first)",
            SOURCE_NAME,
        )
        return 0

    batch_id = f"p{helper_random.random_u64():016x}"
    LOG.info("driver starting; batch_id=%s", batch_id)

    expected = _insert_rows(batch_id)
    if not expected:
        LOG.info("no rows inserted successfully this invocation; exiting cleanly")
        return 0

    LOG.info("inserted %d rows; requesting quiet period", len(expected))
    request_quiet_period(QUIET_PERIOD_S)

    caught_up = _wait_for_catchup(batch_id, len(expected))

    # Liveness anchor: at least one invocation should fully catch up. If this
    # never fires across an entire run the safety assertions below are vacuous.
    sometimes(
        caught_up,
        "mysql: CDC source caught up to all primary inserts after quiet period",
        {
            "source": TABLE_NAME,
            "batch_id": batch_id,
            "rows_inserted": len(expected),
        },
    )

    if not caught_up:
        # Don't run per-row safety assertions on stale data — a slow catchup
        # is a separate concern from row-level correctness.
        LOG.info("catchup did not complete in budget; skipping per-row assertions")
        return 0

    # Safety: every row we inserted must be present with the correct value.
    _check_rows(expected)

    # Count-level safety check. Every confirmed row must be visible, and the
    # source must never show more rows for this batch than were attempted.
    # Strict equality with len(expected) would be a false positive: an insert
    # that raised on the client may still have committed on the primary and
    # will then legitimately replicate. _wait_for_catchup already tolerates
    # this by accepting `count >= expected_count`.
    count_in_mz = _count_batch_rows(batch_id)
    always(
        len(expected) <= count_in_mz <= ROWS_PER_INVOCATION,
        "mysql: CDC source row count matches inserted count after catchup",
        {
            "source": TABLE_NAME,
            "batch_id": batch_id,
            "expected_count": len(expected),
            "attempted_count": ROWS_PER_INVOCATION,
            "observed_count": count_in_mz,
        },
    )

    LOG.info(
        "driver done; asserted on %d rows for batch_id=%s", len(expected), batch_id
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())
The +# committed values are placeholders — `build-antithesis.sh` overwrites +# them in CI with refs to images pushed to Antithesis's GCP Artifact +# Registry, and `make export-env` does the same with local-dev refs. +# +# If you see these placeholder values on a running cluster, your build +# pipeline did not regenerate this file. Run: # bin/pyactivate test/antithesis/export-env.py > test/antithesis/config/.env -# Consumed by test/antithesis/config/docker-compose.yaml at compose-parse time. -MATERIALIZED_IMAGE=ghcr.io/materializeinc/materialize/materialized:mzbuild-EMRA5ARAVQMKNFJIHZJTAPOEWMAGW5TX -ANTITHESIS_WORKLOAD_IMAGE=ghcr.io/materializeinc/materialize/antithesis-workload:mzbuild-YKN4ZHJT7YAPYQLQV5BYTUPCRY2RUDRI +MATERIALIZED_IMAGE=placeholder-not-built +ANTITHESIS_WORKLOAD_IMAGE=placeholder-not-built diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 97d982367e63b..b9c383be7f1a5 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -260,10 +260,10 @@ services: - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd1:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -299,10 +299,10 @@ services: - 
CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 1, "process": 0, "addresses": ["clusterd2:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -484,7 +484,6 @@ services: - MZ_NO_BUILTIN_POSTGRES=1 - MZ_NO_BUILTIN_COCKROACH=1 - MZ_ADAPTER_STASH_URL=postgres://root@postgres-metadata:26257?options=--search_path=adapter - - MZ_LICENSE_KEY=/license_key/license_key volumes: - mzdata:/mzdata - mydata:/var/lib/mysql-files diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index c799269cbd216..fbcfa2da9fbb5 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -116,9 +116,19 @@ def __init__(self) -> None: # Antithesis kill either replica's backing container without taking # the workload offline. # + # `workers=4` per clusterd means each replica runs four timely worker + # threads in one process. The extra intra-process parallelism is the + # surface area Antithesis's thread-pausing fault targets — with a + # single worker, "pause one thread" effectively pauses the whole + # process, which the container-pause fault already covers. The matching + # `WORKERS 4` in the CREATE CLUSTER REPLICAS statement must stay in + # lockstep with this value (it's read by the controller, not by + # clusterd). 
+ # # Each clusterd MUST have its own /scratch volume — the upsert # operator's RocksDB state lives there and takes an exclusive file - # lock. The DEFAULT_MZ_VOLUMES list uses a single named volume + # lock per worker (`/scratch/storage/upsert///LOCK`). + # The DEFAULT_MZ_VOLUMES list uses a single named volume # `scratch:/scratch` shared across containers; passing per-instance # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the # locks separate while leaving the other volumes shared. Found via @@ -128,6 +138,7 @@ def __init__(self) -> None: # corrupted the upsert state. Clusterd( name="clusterd1", + workers=4, volumes=[ "mzdata:/mzdata", "mydata:/var/lib/mysql-files", @@ -137,6 +148,7 @@ def __init__(self) -> None: ), Clusterd( name="clusterd2", + workers=4, volumes=[ "mzdata:/mzdata", "mydata:/var/lib/mysql-files", diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index ce6e664a2c0de..1a8aab5234f51 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -48,14 +48,14 @@ CREATE CLUSTER ${CLUSTER} REPLICAS ( STORAGE ADDRESSES ['clusterd1:2103'], COMPUTECTL ADDRESSES ['clusterd1:2101'], COMPUTE ADDRESSES ['clusterd1:2102'], - WORKERS 1 + WORKERS 4 ), replica2 ( STORAGECTL ADDRESSES ['clusterd2:2100'], STORAGE ADDRESSES ['clusterd2:2103'], COMPUTECTL ADDRESSES ['clusterd2:2101'], COMPUTE ADDRESSES ['clusterd2:2102'], - WORKERS 1 + WORKERS 4 ) ); GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER}; From d56e33aa450cb4001ca2a025b3f68b889e64b3ec Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 14:52:21 -0400 Subject: [PATCH 33/65] test/antithesis: drop --binlog_transaction_dependency_tracking; removed in MySQL 8.4 (WRITESET is the default) --- test/antithesis/config/docker-compose.yaml | 1 - test/antithesis/mzcompose.py | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git 
a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index b9c383be7f1a5..0a9c072b81aad 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -187,7 +187,6 @@ services: - --binlog-row-metadata=full - --server-id=1 - --max-connections=500 - - --binlog_transaction_dependency_tracking=WRITESET healthcheck: test: - CMD diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index fbcfa2da9fbb5..584f39852dd12 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -82,16 +82,18 @@ def __init__(self) -> None: Zookeeper(), Kafka(auto_create_topics=True), SchemaRegistry(), - # MySQL primary — GTID-enabled with WRITESET dependency tracking so the - # replica can safely use parallel workers without losing commit order. + # MySQL primary — GTID-enabled. WRITESET binlog dependency tracking + # is what lets the replica run parallel workers without losing commit + # order; in MySQL 8.4+ WRITESET is the default and the explicit knob + # was removed (`binlog_transaction_dependency_tracking` is unknown + # past 8.4, and the antithesis image is `mysql:9.5.0`). MySql( use_seeded_image=False, volumes=[ "mysqldata_primary:/var/lib/mysql", "mydata:/var/lib/mysql-files", ], - additional_args=create_mysql_server_args(server_id="1", is_master=True) - + ["--binlog_transaction_dependency_tracking=WRITESET"], + additional_args=create_mysql_server_args(server_id="1", is_master=True), ), # MySQL replica — multithreaded replication (4 workers, commit-order # preserved). 
Replication is configured at runtime by From 445f452d0b64347f8240c48027c739a6a010b856 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 12 May 2026 18:25:01 -0400 Subject: [PATCH 34/65] test/antithesis: drop --scratch-directory from clusterd so upsert RocksDB uses mem_env (production behavior) --- .../mzcompose/services/clusterd.py | 10 ++++-- test/antithesis/config/docker-compose.yaml | 8 ++--- test/antithesis/mzcompose.py | 34 ++++++------------- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index e07ca490a5355..bffe3ddc3e470 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -28,7 +28,7 @@ def __init__( options: list[str] = [], restart: str = "no", stop_grace_period: str = "120s", - scratch_directory: str = "/scratch", + scratch_directory: str | None = "/scratch", volumes: list[str] = [], workers: int = 1, process_names: list[str] = [], @@ -68,7 +68,13 @@ def __init__( f"CLUSTERD_STORAGE_TIMELY_CONFIG={storage_timely_config}", ] - options = ["clusterd", f"--scratch-directory={scratch_directory}", *options] + # `scratch_directory=None` omits the CLI flag entirely. clusterd + # treats this as "no scratch" — RocksDB switches to its in-memory + # env (`Env::mem_env()`), matching the production deployment shape + # where cluster replicas have no scratch disk attached. 
+ options = ["clusterd", *options] + if scratch_directory is not None: + options.insert(1, f"--scratch-directory={scratch_directory}") config: ServiceConfig = {} diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 0a9c072b81aad..446c9d0a189f6 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -241,7 +241,6 @@ services: - -- command: - clusterd - - --scratch-directory=/scratch ports: - 2100 - 2101 @@ -269,7 +268,7 @@ services: - mzdata:/mzdata - mydata:/var/lib/mysql-files - tmp:/share/tmp - - clusterd1_scratch:/scratch + - scratch:/scratch restart: 'no' stop_grace_period: 120s platform: linux/amd64 @@ -280,7 +279,6 @@ services: - -- command: - clusterd - - --scratch-directory=/scratch ports: - 2100 - 2101 @@ -308,7 +306,7 @@ services: - mzdata:/mzdata - mydata:/var/lib/mysql-files - tmp:/share/tmp - - clusterd2_scratch:/scratch + - scratch:/scratch restart: 'no' stop_grace_period: 120s platform: linux/amd64 @@ -548,5 +546,3 @@ volumes: scratch: null mysqldata_primary: null mysqldata_replica: null - clusterd1_scratch: null - clusterd2_scratch: null diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index 584f39852dd12..5f7da9d8f0e97 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -127,36 +127,24 @@ def __init__(self) -> None: # lockstep with this value (it's read by the controller, not by # clusterd). # - # Each clusterd MUST have its own /scratch volume — the upsert - # operator's RocksDB state lives there and takes an exclusive file - # lock per worker (`/scratch/storage/upsert///LOCK`). - # The DEFAULT_MZ_VOLUMES list uses a single named volume - # `scratch:/scratch` shared across containers; passing per-instance - # named volumes (`clusterd1_scratch`, `clusterd2_scratch`) keeps the - # locks separate while leaving the other volumes shared. 
Found via - # an Antithesis run where clusterd1 deadlocked retrying to open - # `/scratch/storage/upsert/u3/0/LOCK` because clusterd2 held it, - # which then drove a continuous suspend-and-restart loop that - # corrupted the upsert state. + # `scratch_directory=None` matches production: cluster replicas in + # cloud deployments don't get a scratch disk, so the upsert operator's + # RocksDB initializes with `Env::mem_env()` and stores its state + # entirely in process memory. Passing a scratch directory would put + # us on a code path production never exercises, and would also + # require careful per-instance volume plumbing to avoid the two + # clusterds racing on the same `/scratch/storage/upsert///LOCK` + # file (which manifested as continuous Stalled/suspend-and-restart + # loops on clusterd1 in an earlier run). Clusterd( name="clusterd1", workers=4, - volumes=[ - "mzdata:/mzdata", - "mydata:/var/lib/mysql-files", - "tmp:/share/tmp", - "clusterd1_scratch:/scratch", - ], + scratch_directory=None, ), Clusterd( name="clusterd2", workers=4, - volumes=[ - "mzdata:/mzdata", - "mydata:/var/lib/mysql-files", - "tmp:/share/tmp", - "clusterd2_scratch:/scratch", - ], + scratch_directory=None, ), Materialized( external_blob_store=True, From 492c30a6913a37197dd00aec5e555a4548feee19 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 13 May 2026 15:59:05 +0800 Subject: [PATCH 35/65] Not what we want, but what we deserve? 
--- .../parallel-workload-no-unexpected-errors.md | 44 +++ .../scratchbook/property-catalog.md | 20 +- .../test/parallel_driver_parallel_workload.py | 366 ++++++++++++++++++ 3 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md create mode 100644 test/antithesis/workload/test/parallel_driver_parallel_workload.py diff --git a/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md b/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md new file mode 100644 index 0000000000000..c1e0f67dbc2a8 --- /dev/null +++ b/test/antithesis/scratchbook/properties/parallel-workload-no-unexpected-errors.md @@ -0,0 +1,44 @@ +# parallel-workload-no-unexpected-errors + +## Summary +Randomized concurrent SQL against a shared pool of catalog objects should not +produce unexpected query failures, even while Antithesis injects coordinator and +replica faults. + +## Evidence + +### Code Paths +- `src/adapter/src/coord/sequencer/` — concurrent DDL/DML sequencing and catalog transactions +- `src/catalog/src/durable/` — catalog state persistence and recovery across restarts +- `src/compute/src/` — materialized-view rendering and execution after concurrent DDL + +### How It Works +The Antithesis workload uses a fixed shared schema and a small pool of tables +and materialized views. Multiple worker threads repeatedly race `CREATE`, +`DROP`, `INSERT`, `UPDATE`, `DELETE`, and `SELECT` against those objects. This +deliberately forces the coordinator through concurrent catalog changes while the +Antithesis fault injector pauses or restarts components underneath it. + +### What Goes Wrong on Violation +Unexpected SQL failures here usually mean a concurrency bug in catalog +sequencing, plan invalidation, or recovery. 
The workload already tolerates the +expected race outcomes like "object was dropped" or "concurrent catalog +modification"; what remains should be a real bug or an unclassified new failure +mode worth triage. + +### Workload Verification +1. Ensure the shared schema exists +2. Spawn multiple worker threads +3. Randomly issue DDL/DML/SELECT against a fixed object pool +4. Count expected race/drop errors separately +5. Assert that no other SQL error escapes + +### SUT-Side Instrumentation Notes +- Best primary signal is workload-side because the interesting failures are + externally visible query errors, not one specific internal assertion site +- Candidate follow-up: add targeted SUT-side assertions for catalog invalidation + and dropped-object dependency paths once a concrete failure mode is found + +### Provenance +Adapted from the existing `test/parallel-workload/mzcompose.py` randomized SQL +stress test into the Antithesis workload model. diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 2c308cf3e2e2b..d4074c3bf7e2e 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -405,7 +405,25 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` | **Antithesis Angle** | Inject persist consensus latency, kill+restart concurrently to create a competing writer, race the metadata fetcher's partition-add against a mint that is already in flight. The retry loop in `mint()` has no upper bound; this property confirms it is not livelocked even under adversarial schedules. | | **Why It Matters** | A livelocked mint loop manifests as a source that never advances its frontier — externally indistinguishable from a stalled Kafka consumer, but caused inside Materialize. 
| -## Category 8: MySQL CDC Source +## Category 8: Randomized Concurrency Stress + +Properties that use intentionally adversarial concurrent SQL workloads to flush +out catalog, planning, and recovery bugs that are hard to encode as a single +deterministic correctness scenario. + +### parallel-workload-no-unexpected-errors — Randomized Concurrent SQL Only Hits Expected Race Errors + +| | | +|---|---| +| **Type** | Safety | +| **Priority** | P2 — broad regression net rather than one product contract, but good at finding real crashes and catalog races | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_parallel_workload.py`. A shared schema plus four tables/four materialized views are stressed by multiple worker threads racing `CREATE`, `DROP`, `INSERT`, `UPDATE`, `DELETE`, and `SELECT`. The driver records `sometimes("parallel workload: randomized concurrent SQL executed successfully", …)` for liveness, `sometimes("parallel workload: DDL actions were exercised", …)` for coverage, `sometimes("parallel workload: expected concurrent-catalog races were observed", …)` to confirm the workload is hitting the intended contention paths, and one `always("parallel workload: no unexpected SQL errors escaped the randomized stress driver", …)` safety assertion for the failure signal itself. This is intentionally a subset port of `test/parallel-workload/mzcompose.py`, scoped to the existing Antithesis topology rather than the full mzcompose service matrix. | +| **Property** | Under fault injection and concurrent randomized SQL, Materialize may return expected dropped-object / concurrent-catalog errors, but it must not surface *unexpected* query failures. | +| **Invariant** | `Always`: every SQL exception raised by the randomized workload matches the driver's expected-concurrency ignore list; any uncategorized error is a property failure. 
| +| **Antithesis Angle** | Antithesis can pause or restart environmentd/clusterd while several client threads concurrently create/drop objects and query them. The interesting windows are plan invalidation, catalog transaction races, and recovery of half-finished DDL. | +| **Why It Matters** | This is a broad bug-finder for timing-sensitive failures that do not map cleanly to one narrow user contract but still produce visible query failures or crashes. It complements the more specific properties by covering the "something went wrong under concurrent SQL churn" space. | + +## Category 9: MySQL CDC Source Properties specific to Materialize's MySQL CDC source pipeline, which reads from a multithreaded MySQL replica. The topology adds a MySQL primary (GTID + diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py new file mode 100644 index 0000000000000..46ece1c308341 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis-native randomized parallel SQL workload. + +This ports the *intent* of `test/parallel-workload/mzcompose.py` into the +existing Antithesis workload model without trying to ship the whole +`materialize.parallel_workload` Python stack inside the workload image. 
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s"
)
LOG = logging.getLogger("driver.parallel_workload")

# Cluster that hosts the materialized views; overridable through the environment.
CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster")
SCHEMA = "antithesis_parallel_workload"

# Size of the shared object pool: tables t0..t{N-1} and views mv0..mv{N-1}.
TABLE_COUNT = 4
WORKER_THREADS = 4
# Wall-clock budget, in seconds, for the worker loops of one invocation.
RUNTIME_S = 25.0
CONNECT_TIMEOUT_S = 5
# Inclusive upper bounds for the random `k` and `v` column values.
MAX_KEY = 31
MAX_VALUE = 1000

# Substrings of error messages that are an expected outcome of racing
# DDL/DML against a shared object pool. Matched in list order.
EXPECTED_ERROR_SUBSTRINGS = [
    "already exists",
    "does not exist",
    "unknown catalog item",
    "unknown schema",
    "was dropped while executing a statement",
    "another session modified the catalog while this DDL transaction was open",
    "object state changed while transaction was in progress",
    "query could not complete",
    "cached plan must not change result type",
    "the transaction's active cluster has been dropped",
    "concurrent transaction",
]


@dataclass
class WorkerStats:
    """Per-worker counters; each worker thread writes only its own instance."""

    # Statements that completed without raising.
    successes: int = 0
    # Connection-level failures (failed connect or dropped connection).
    reconnects: int = 0
    # Statements that failed with an expected race/drop error.
    ignored_errors: int = 0
    # Per-action tally; counts both successful and ignored-error attempts.
    actions: Counter[str] = field(default_factory=Counter)
    # Ignored errors keyed by the matching EXPECTED_ERROR_SUBSTRINGS entry.
    ignored_by_reason: Counter[str] = field(default_factory=Counter)
    # First unexpected failure seen by this worker, if any.
    unexpected: dict[str, Any] | None = None


def table_name(idx: int) -> str:
    """Return the schema-qualified name of pool table `idx`."""
    return f"{SCHEMA}.t{idx}"


def mv_name(idx: int) -> str:
    """Return the schema-qualified name of pool materialized view `idx`."""
    return f"{SCHEMA}.mv{idx}"


def _create_table_sql(table: str) -> str:
    """Build the idempotent CREATE TABLE statement shared by setup and workers."""
    return (
        f"CREATE TABLE IF NOT EXISTS {table} ("
        "worker TEXT NOT NULL, "
        "k BIGINT NOT NULL, "
        "v BIGINT NOT NULL"
        ")"
    )


def ensure_shared_objects() -> None:
    """Create the shared schema and the initially present tables."""
    execute_retry(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    # NOTE(review): only the first two of TABLE_COUNT tables are pre-created;
    # the rest appear only via the workers' `create_table` action. Presumably
    # deliberate, so "does not exist" paths are exercised early — confirm.
    for idx in range(2):
        execute_retry(_create_table_sql(table_name(idx)))


def connect() -> psycopg.Connection[Any]:
    """Open a new autocommit connection using the helper_pg settings."""
    return psycopg.connect(
        host=PGHOST,
        port=PGPORT,
        user=PGUSER,
        dbname=PGDATABASE,
        connect_timeout=CONNECT_TIMEOUT_S,
        autocommit=True,
    )


def choose_action(rng: random.Random) -> str:
    """Draw one action name from the weighted action mix using `rng`."""
    return rng.choices(
        [
            "create_table",
            "drop_table",
            "insert",
            "update",
            "delete",
            "select_table",
            "create_mv",
            "drop_mv",
            "select_mv",
        ],
        weights=[6, 2, 25, 12, 10, 20, 6, 2, 17],
        k=1,
    )[0]


def execute_action(
    conn: psycopg.Connection[Any], rng: random.Random, worker_name: str, action: str
) -> None:
    """Run `action` against one randomly chosen table/view pair of the pool.

    Raises ValueError for an unknown action name. Database errors propagate
    to the caller, which classifies them.
    """
    idx = rng.randrange(TABLE_COUNT)
    table = table_name(idx)
    mv = mv_name(idx)

    with conn.cursor() as cur:
        if action == "create_table":
            cur.execute(_create_table_sql(table))
        elif action == "drop_table":
            cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE")
        elif action == "insert":
            cur.execute(
                f"INSERT INTO {table} (worker, k, v) VALUES (%s, %s, %s)",
                (
                    worker_name,
                    rng.randint(0, MAX_KEY),
                    rng.randint(0, MAX_VALUE),
                ),
            )
        elif action == "update":
            cur.execute(
                f"UPDATE {table} SET v = v + 1 WHERE k = %s",
                (rng.randint(0, MAX_KEY),),
            )
        elif action == "delete":
            cur.execute(
                f"DELETE FROM {table} WHERE k = %s",
                (rng.randint(0, MAX_KEY),),
            )
        elif action == "select_table":
            cur.execute(
                f"SELECT count(*)::bigint, min(v)::bigint, max(v)::bigint FROM {table}"
            )
            cur.fetchall()
        elif action == "create_mv":
            cur.execute(
                f"CREATE MATERIALIZED VIEW IF NOT EXISTS {mv} "
                f"IN CLUSTER {CLUSTER} AS "
                f"SELECT worker, count(*)::bigint AS c, sum(v)::bigint AS s "
                f"FROM {table} GROUP BY worker"
            )
        elif action == "drop_mv":
            cur.execute(f"DROP MATERIALIZED VIEW IF EXISTS {mv}")
        elif action == "select_mv":
            cur.execute(
                f"SELECT count(*)::bigint, sum(c)::bigint, sum(s)::bigint FROM {mv}"
            )
            cur.fetchall()
        else:
            raise ValueError(f"unknown action {action}")


def expected_error_reason(exc: BaseException) -> str | None:
    """Return the first expected-error substring found in `str(exc)`, else None."""
    msg = str(exc)
    for candidate in EXPECTED_ERROR_SUBSTRINGS:
        if candidate in msg:
            return candidate
    return None


def is_connection_error(exc: BaseException) -> bool:
    """Return True for psycopg OperationalError/InterfaceError instances."""
    return isinstance(exc, (psycopg.OperationalError, psycopg.InterfaceError))


def run_worker(
    worker_id: int,
    seed: int,
    deadline: float,
    stop: threading.Event,
    stats: WorkerStats,
) -> None:
    """Issue random actions until `deadline` passes or `stop` is set.

    Outcomes are recorded in `stats`. Connection errors trigger a reconnect,
    expected race errors are counted and ignored, and the first other error
    is stored in `stats.unexpected` and stops every worker via `stop`.
    """
    rng = random.Random(seed)
    worker_name = f"pw{worker_id}"
    conn: psycopg.Connection[Any] | None = None

    try:
        while time.monotonic() < deadline and not stop.is_set():
            if conn is None or conn.closed:
                try:
                    conn = connect()
                except Exception as exc:  # noqa: BLE001
                    if not is_connection_error(exc):
                        stats.unexpected = {
                            "worker": worker_name,
                            "action": "connect",
                            "error": str(exc),
                        }
                        stop.set()
                        return
                    stats.reconnects += 1
                    # Brief randomized backoff before the next connect attempt.
                    time.sleep(rng.uniform(0.05, 0.2))
                    continue

            action = choose_action(rng)
            try:
                execute_action(conn, rng, worker_name, action)
                stats.successes += 1
                stats.actions[action] += 1
            except Exception as exc:  # noqa: BLE001
                if is_connection_error(exc):
                    stats.reconnects += 1
                    try:
                        conn.close()
                    except Exception:  # noqa: BLE001
                        pass
                    # Force a fresh connection on the next iteration.
                    conn = None
                    continue

                reason = expected_error_reason(exc)
                if reason is not None:
                    stats.ignored_errors += 1
                    stats.ignored_by_reason[reason] += 1
                    stats.actions[action] += 1
                    continue

                stats.unexpected = {
                    "worker": worker_name,
                    "action": action,
                    "error": str(exc),
                }
                LOG.exception("unexpected parallel workload error")
                stop.set()
                return

            time.sleep(rng.uniform(0.005, 0.05))
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:  # noqa: BLE001
                pass


def main() -> int:
    """Run the worker threads, aggregate their stats, and emit the assertions.

    Returns 1 when any worker recorded an unexpected error, otherwise 0.
    """
    ensure_shared_objects()

    stop = threading.Event()
    deadline = time.monotonic() + RUNTIME_S
    # Seeds come from helper_random so each worker's random.Random stream is
    # derived from that helper rather than from OS entropy.
    seeds = [helper_random.random_u64() for _ in range(WORKER_THREADS)]
    stats = [WorkerStats() for _ in range(WORKER_THREADS)]
    threads = [
        threading.Thread(
            name=f"parallel-workload-{idx}",
            target=run_worker,
            args=(idx, seeds[idx], deadline, stop, stats[idx]),
        )
        for idx in range(WORKER_THREADS)
    ]

    LOG.info("parallel workload starting; schema=%s threads=%d", SCHEMA, WORKER_THREADS)
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    total_successes = sum(worker.successes for worker in stats)
    total_reconnects = sum(worker.reconnects for worker in stats)
    total_ignored = sum(worker.ignored_errors for worker in stats)
    action_counts = Counter[str]()
    ignored_by_reason = Counter[str]()
    unexpected = next((worker.unexpected for worker in stats if worker.unexpected), None)
    for worker in stats:
        action_counts.update(worker.actions)
        ignored_by_reason.update(worker.ignored_by_reason)

    sometimes(
        total_successes >= WORKER_THREADS * 5,
        "parallel workload: randomized concurrent SQL executed successfully",
        {
            "successes": total_successes,
            "threads": WORKER_THREADS,
            "actions": dict(action_counts),
            "reconnects": total_reconnects,
        },
    )
    sometimes(
        action_counts["create_table"]
        + action_counts["drop_table"]
        + action_counts["create_mv"]
        + action_counts["drop_mv"]
        > 0,
        "parallel workload: DDL actions were exercised",
        {
            "create_table": action_counts["create_table"],
            "drop_table": action_counts["drop_table"],
            "create_mv": action_counts["create_mv"],
            "drop_mv": action_counts["drop_mv"],
        },
    )
    sometimes(
        total_ignored > 0,
        "parallel workload: expected concurrent-catalog races were observed",
        {
            "ignored_errors": total_ignored,
            "ignored_by_reason": dict(ignored_by_reason),
        },
    )
    always(
        unexpected is None,
        "parallel workload: no unexpected SQL errors escaped the randomized stress driver",
        {
            "unexpected": unexpected,
            "successes": total_successes,
            "ignored_errors": total_ignored,
            "reconnects": total_reconnects,
            "actions": dict(action_counts),
        },
    )

    LOG.info(
        "parallel workload done; successes=%d ignored=%d reconnects=%d unexpected=%s",
        total_successes,
        total_ignored,
        total_reconnects,
        unexpected,
    )
    return 1 if unexpected is not None else 0


if __name__ == "__main__":
    sys.exit(main())
.../first_select_upsert_implementation.py | 3 +- test/antithesis/workload/test/helper_mysql.py | 10 +- .../workload/test/helper_mysql_source.py | 9 +- test/antithesis/workload/test/helper_pg.py | 19 +- .../parallel_driver_kafka_none_envelope.py | 3 +- ...rallel_driver_mv_reflects_table_updates.py | 3 +- .../test/parallel_driver_mysql_cdc.py | 3 +- .../test/parallel_driver_parallel_workload.py | 609 +++++++++--------- ...rallel_driver_strict_serializable_reads.py | 22 +- .../parallel_driver_upsert_latest_value.py | 3 +- ...ton_driver_catalog_recovery_consistency.py | 40 +- ...ngleton_driver_upsert_state_rehydration.py | 3 +- 33 files changed, 781 insertions(+), 373 deletions(-) create mode 100644 misc/python/test/antithesis/workload/Dockerfile create mode 100644 misc/python/test/antithesis/workload/mzbuild.yml create mode 100644 test/antithesis/workload/.gitignore create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/__init__.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/composition.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py create mode 100644 test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py diff --git a/ci/test/build.py b/ci/test/build.py index 95f4227afbaa7..89d9402aab08f 100755 --- a/ci/test/build.py +++ b/ci/test/build.py @@ -63,9 +63,7 @@ def main() -> None: repo.images[name] for name in antithesis_images ) else: - deps = repo.resolve_dependencies( - image for image in repo if 
image.publish - ) + deps = repo.resolve_dependencies(image for image in repo if image.publish) deps.ensure(pre_build=lambda images: upload_debuginfo(repo, images)) set_build_status("success") annotate_buildkite_with_tags(repo.rd.arch, deps) diff --git a/misc/python/materialize/mzbuild.py b/misc/python/materialize/mzbuild.py index 2200188139219..08ca9bb43c943 100644 --- a/misc/python/materialize/mzbuild.py +++ b/misc/python/materialize/mzbuild.py @@ -473,13 +473,21 @@ def __init__(self, rd: RepositoryDetails, path: Path, config: dict[str, Any]): def run(self, prep: Any) -> None: super().run(prep) + source = Path(self.source) for src in self.inputs(): - dst = self.path / self.destination / src + rel = Path(src).relative_to(source) + dst = self.path / self.destination / rel dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(self.rd.root / self.source / src, dst) + shutil.copy(self.rd.root / src, dst) def inputs(self) -> set[str]: - return set(git.expand_globs(self.rd.root / self.source, self.matching)) + # Return repo-root-relative paths so that `ResolvedImage.fingerprint` + # (which resolves each input as `rd.root / rel_path`) can lstat them. + source = Path(self.source) + return { + str(source / p) + for p in git.expand_globs(self.rd.root / self.source, self.matching) + } class CargoPreImage(PreImage): diff --git a/misc/python/test/antithesis/workload/Dockerfile b/misc/python/test/antithesis/workload/Dockerfile new file mode 100644 index 0000000000000..513a8d75b5a04 --- /dev/null +++ b/misc/python/test/antithesis/workload/Dockerfile @@ -0,0 +1,68 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +# Antithesis workload client for Materialize. +# +# Python-based test driver that connects to materialized via pgwire, +# produces Kafka messages, and emits Antithesis assertions. The +# parallel-workload driver reuses the real `materialize.parallel_workload` +# Python package — see mzbuild.yml for the pre-image copy of the slice it +# needs, and stubs/materialize/mzcompose/ for the docker-compose surface we +# have to mock out. + +FROM python:3.12-slim-bookworm + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +# `confluent-kafka[avro]` pulls fastavro, which `data_ingest.executor` +# imports at module top via `confluent_kafka.schema_registry.avro`. +# `pg8000`, `websocket-client`, `requests`, `xxhash`, `zstandard` cover the +# rest of the module-load-time imports walking from `parallel_workload` → +# `data_ingest` → `materialize.util`. +RUN pip install --no-cache-dir \ + psycopg[binary]==3.2.9 \ + "confluent-kafka[avro]==2.8.0" \ + antithesis==0.2.0 \ + PyMySQL==1.1.1 \ + pg8000==1.31.2 \ + websocket-client==1.8.0 \ + requests==2.32.3 \ + xxhash==3.5.0 \ + zstandard==0.23.0 + +# setup-complete script +COPY setup-complete.sh /usr/local/bin/setup-complete.sh +RUN chmod +x /usr/local/bin/setup-complete.sh + +# Test template directory — populated by antithesis-workload skill later +RUN mkdir -p /opt/antithesis/test/v1/materialize + +# Catalog directory for Python assertion cataloging +RUN mkdir -p /opt/antithesis/catalog + +# Ship the `materialize.*` Python package needed by the parallel-workload +# driver. Stubs are copied first so that the real parallel-workload code +# layered on top can satisfy its top-level `from materialize.mzcompose...` +# imports against tiny placeholders. `MZ_ROOT` is required by +# `materialize/__init__.py` at import time — point it at the package root. 
+COPY stubs/materialize/ /opt/antithesis-pkg/materialize/ +COPY materialize/ /opt/antithesis-pkg/materialize/ +ENV PYTHONPATH=/opt/antithesis-pkg +ENV MZ_ROOT=/opt/antithesis-pkg + +# Copy test templates and entrypoint +COPY test/ /opt/antithesis/test/v1/materialize/ +COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /usr/local/bin/workload-entrypoint.sh +RUN chmod +x /opt/antithesis/test/v1/materialize/* 2>/dev/null || true + +ENTRYPOINT ["/usr/local/bin/workload-entrypoint.sh"] diff --git a/misc/python/test/antithesis/workload/mzbuild.yml b/misc/python/test/antithesis/workload/mzbuild.yml new file mode 100644 index 0000000000000..2d69faddfd065 --- /dev/null +++ b/misc/python/test/antithesis/workload/mzbuild.yml @@ -0,0 +1,39 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +name: antithesis-workload + +# The parallel-workload driver reuses the real `materialize.parallel_workload` +# Python package rather than reimplementing it. Copy the slice of +# `misc/python/materialize/` it needs into the build context so the Dockerfile +# can bundle it into the image. Everything in `materialize.mzcompose.*` is +# replaced by tiny stubs (see `stubs/materialize/mzcompose/`) — Antithesis +# injects faults at the container layer, so the workload container has no +# docker-compose orchestration to call into. +pre-image: + - type: copy + source: misc/python + destination: . + matching: materialize/__init__.py + - type: copy + source: misc/python + destination: . + matching: materialize/util.py + - type: copy + source: misc/python + destination: . 
+ matching: materialize/sqlsmith.py + - type: copy + source: misc/python + destination: . + matching: materialize/parallel_workload + - type: copy + source: misc/python + destination: . + matching: materialize/data_ingest diff --git a/test/antithesis/workload/.gitignore b/test/antithesis/workload/.gitignore new file mode 100644 index 0000000000000..2c028d08d5e96 --- /dev/null +++ b/test/antithesis/workload/.gitignore @@ -0,0 +1,3 @@ +# Populated at image-build time by the `pre-image: type: copy` directives in +# mzbuild.yml — committing them would diverge from the source tree. +/materialize/ diff --git a/test/antithesis/workload/Dockerfile b/test/antithesis/workload/Dockerfile index 5cca619ed8234..513a8d75b5a04 100644 --- a/test/antithesis/workload/Dockerfile +++ b/test/antithesis/workload/Dockerfile @@ -10,7 +10,11 @@ # Antithesis workload client for Materialize. # # Python-based test driver that connects to materialized via pgwire, -# produces Kafka messages, and emits Antithesis assertions. +# produces Kafka messages, and emits Antithesis assertions. The +# parallel-workload driver reuses the real `materialize.parallel_workload` +# Python package — see mzbuild.yml for the pre-image copy of the slice it +# needs, and stubs/materialize/mzcompose/ for the docker-compose surface we +# have to mock out. FROM python:3.12-slim-bookworm @@ -19,11 +23,21 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ postgresql-client \ && rm -rf /var/lib/apt/lists/* +# `confluent-kafka[avro]` pulls fastavro, which `data_ingest.executor` +# imports at module top via `confluent_kafka.schema_registry.avro`. +# `pg8000`, `websocket-client`, `requests`, `xxhash`, `zstandard` cover the +# rest of the module-load-time imports walking from `parallel_workload` → +# `data_ingest` → `materialize.util`. 
RUN pip install --no-cache-dir \ psycopg[binary]==3.2.9 \ - confluent-kafka==2.8.0 \ + "confluent-kafka[avro]==2.8.0" \ antithesis==0.2.0 \ - PyMySQL==1.1.1 + PyMySQL==1.1.1 \ + pg8000==1.31.2 \ + websocket-client==1.8.0 \ + requests==2.32.3 \ + xxhash==3.5.0 \ + zstandard==0.23.0 # setup-complete script COPY setup-complete.sh /usr/local/bin/setup-complete.sh @@ -35,6 +49,16 @@ RUN mkdir -p /opt/antithesis/test/v1/materialize # Catalog directory for Python assertion cataloging RUN mkdir -p /opt/antithesis/catalog +# Ship the `materialize.*` Python package needed by the parallel-workload +# driver. Stubs are copied first so that the real parallel-workload code +# layered on top can satisfy its top-level `from materialize.mzcompose...` +# imports against tiny placeholders. `MZ_ROOT` is required by +# `materialize/__init__.py` at import time — point it at the package root. +COPY stubs/materialize/ /opt/antithesis-pkg/materialize/ +COPY materialize/ /opt/antithesis-pkg/materialize/ +ENV PYTHONPATH=/opt/antithesis-pkg +ENV MZ_ROOT=/opt/antithesis-pkg + # Copy test templates and entrypoint COPY test/ /opt/antithesis/test/v1/materialize/ COPY workload-entrypoint.sh /usr/local/bin/workload-entrypoint.sh diff --git a/test/antithesis/workload/mzbuild.yml b/test/antithesis/workload/mzbuild.yml index f62b4c073bb00..b957b4f8a2046 100644 --- a/test/antithesis/workload/mzbuild.yml +++ b/test/antithesis/workload/mzbuild.yml @@ -8,3 +8,30 @@ # by the Apache License, Version 2.0. name: antithesis-workload + +# The parallel-workload driver reuses the real `materialize.parallel_workload` +# Python package rather than reimplementing it. Copy the needed slice of +# `misc/python/materialize/` into the build context so the Dockerfile can +# bundle it into the image. The list is intentionally narrow — everything +# else is mocked out by the stubs in `stubs/materialize/mzcompose/...`. +pre-image: + - type: copy + source: misc/python + destination: . 
+ matching: materialize/__init__.py + - type: copy + source: misc/python + destination: . + matching: materialize/util.py + - type: copy + source: misc/python + destination: . + matching: materialize/sqlsmith.py + - type: copy + source: misc/python + destination: . + matching: materialize/parallel_workload/** + - type: copy + source: misc/python + destination: . + matching: materialize/data_ingest/** diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py new file mode 100644 index 0000000000000..4896a7d403416 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/__init__.py @@ -0,0 +1,37 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose` for the Antithesis workload image. + +`materialize.parallel_workload` and `materialize.data_ingest` import +`materialize.mzcompose` symbols at module load time even on code paths that +don't actually run a docker-compose harness. The Antithesis workload image is +a slim Python container with no docker/mzbuild toolchain, so we ship these +stubs in its `PYTHONPATH` to satisfy the imports. Only attributes the +parallel-workload driver hits at module top are provided; anything called at +runtime in this environment would be a bug. 
+""" + +from __future__ import annotations + +from typing import Any + + +def get_default_system_parameters() -> dict[str, str]: + return {} + + +cluster_replica_size_map: dict[str, Any] = {} + + +class _LoaderModule: + pass + + +loader = _LoaderModule() diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/composition.py b/test/antithesis/workload/stubs/materialize/mzcompose/composition.py new file mode 100644 index 0000000000000..4e0fff97fbbcd --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/composition.py @@ -0,0 +1,31 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.composition`. See package __init__.py.""" + +from __future__ import annotations + +from typing import Any + + +class Composition: + """Placeholder type so that `Composition | None` annotations resolve. + + Every code path in `parallel_workload` that calls methods on a Composition + is gated on `Scenario.{Kill,BackupRestore,ZeroDowntimeDeploy}` — none of + which the Antithesis driver selects. Instantiating one in this + environment is a programming error. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + raise RuntimeError( + "materialize.mzcompose.composition.Composition is stubbed in the " + "Antithesis workload image; Antithesis injects faults at the " + "container layer, so docker-compose orchestration is unavailable." 
+ ) diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py new file mode 100644 index 0000000000000..caae679255ee1 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/__init__.py @@ -0,0 +1,8 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py new file mode 100644 index 0000000000000..eddc6d93231e5 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/helpers/iceberg.py @@ -0,0 +1,24 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.helpers.iceberg`. See package __init__.py.""" + +from __future__ import annotations + +from typing import Any + + +def setup_polaris_for_iceberg(c: Any, *args: Any, **kwargs: Any) -> tuple[str, str]: + # `Database.create` calls this unconditionally. The Antithesis topology + # does not run Polaris; the driver overrides `Database.create` to skip the + # iceberg connection setup, so this function should never be reached. 
+ raise RuntimeError( + "setup_polaris_for_iceberg() stub: iceberg sinks are not supported " + "inside the Antithesis workload container." + ) diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py new file mode 100644 index 0000000000000..caae679255ee1 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/__init__.py @@ -0,0 +1,8 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py new file mode 100644 index 0000000000000..a7d436724ace6 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/materialized.py @@ -0,0 +1,33 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.services.materialized`. 
See package +__init__.py for context.""" + +from __future__ import annotations + +from enum import Enum +from typing import Any + +LEADER_STATUS_HEALTHCHECK: list[str] = [] + + +class DeploymentStatus(Enum): + READY_TO_PROMOTE = "ready_to_promote" + IS_LEADER = "is_leader" + + +class Materialized: + """Placeholder; only instantiated by `ZeroDowntimeDeployAction`.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + raise RuntimeError( + "Materialized service stub: zero-downtime-deploy is not " + "supported inside the Antithesis workload container." + ) diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py new file mode 100644 index 0000000000000..07ee119c96d48 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/minio.py @@ -0,0 +1,21 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.services.minio`. See package __init__.py.""" + +from __future__ import annotations + + +def minio_blob_uri() -> str: + # Only referenced from BackupRestoreAction, which the Antithesis driver + # never schedules. + raise RuntimeError( + "minio_blob_uri() stub: BackupRestore scenario is not supported " + "inside the Antithesis workload container." 
+ ) diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py new file mode 100644 index 0000000000000..1aeb60be61c16 --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/mysql.py @@ -0,0 +1,22 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.services.mysql`. See package __init__.py. + +Only the `DEFAULT_ROOT_PASSWORD` class attribute is read at runtime — the +constant must match the real `MySql` service so the parallel-workload's +`CREATE SECRET mypass AS ...` matches the actual MySQL container password +provisioned by the Antithesis topology. +""" + +from __future__ import annotations + + +class MySql: + DEFAULT_ROOT_PASSWORD = "p@ssw0rd" diff --git a/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py b/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py new file mode 100644 index 0000000000000..023cde11b9bbd --- /dev/null +++ b/test/antithesis/workload/stubs/materialize/mzcompose/services/sql_server.py @@ -0,0 +1,24 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Stub of `materialize.mzcompose.services.sql_server`. See package __init__.py. 
+ +Constants kept in sync with the real `SqlServer` service so any SQL emitted +referring to them stays well-formed. The Antithesis topology doesn't actually +include a sql-server container — driver code avoids +`CreateSqlServerSourceAction` and overrides the connection setup in +`Database.create` accordingly. +""" + +from __future__ import annotations + + +class SqlServer: + DEFAULT_USER = "SA" + DEFAULT_SA_PASSWORD = "RPSsql12345" diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py index 143dd8c103dce..ff90867b0b6f5 100755 --- a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py +++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py @@ -45,6 +45,7 @@ import time import psycopg +from antithesis.assertions import sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -53,8 +54,6 @@ query_one_retry, ) -from antithesis.assertions import sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -75,14 +74,17 @@ def _probe_select_one() -> bool: the recovery transition we are looking for. 
""" try: - with psycopg.connect( - host=PGHOST, - port=PGPORT, - user=PGUSER, - dbname=PGDATABASE, - connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), - autocommit=True, - ) as conn, conn.cursor() as cur: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as conn, + conn.cursor() as cur, + ): cur.execute("SELECT 1") row = cur.fetchone() return row is not None and row[0] == 1 diff --git a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py index faee0fd0c680e..efd906a725844 100755 --- a/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py +++ b/test/antithesis/workload/test/anytime_kafka_frontier_monotonic.py @@ -42,11 +42,10 @@ import sys import time +from antithesis.assertions import always from helper_pg import query_retry from helper_source_stats import offset_committed -from antithesis.assertions import always - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py index 9801c4dfa65b7..a8d6be62ae6a9 100755 --- a/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py +++ b/test/antithesis/workload/test/anytime_kafka_offset_known_not_below_committed.py @@ -40,9 +40,8 @@ import sys import time -from helper_pg import query_retry - from antithesis.assertions import always +from helper_pg import query_retry logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py index 85042a317d7cb..b453f62631aac 100755 --- 
a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py +++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py @@ -50,9 +50,8 @@ import sys import time -from helper_pg import query_one_retry, query_retry - from antithesis.assertions import reachable, sometimes +from helper_pg import query_one_retry, query_retry logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py index 4380b5f4bd40d..ee603e60e88d6 100644 --- a/test/antithesis/workload/test/first_mysql_replica_setup.py +++ b/test/antithesis/workload/test/first_mysql_replica_setup.py @@ -30,9 +30,8 @@ import time import helper_mysql -from helper_mysql_source import ensure_mysql_cdc_source - from antithesis.assertions import reachable, sometimes +from helper_mysql_source import ensure_mysql_cdc_source logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" diff --git a/test/antithesis/workload/test/first_select_upsert_implementation.py b/test/antithesis/workload/test/first_select_upsert_implementation.py index 03394a1ebd7f7..584f40da7812c 100755 --- a/test/antithesis/workload/test/first_select_upsert_implementation.py +++ b/test/antithesis/workload/test/first_select_upsert_implementation.py @@ -29,9 +29,8 @@ import sys import helper_random -from helper_pg import execute_internal_retry - from antithesis.assertions import sometimes +from helper_pg import execute_internal_retry logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py index e99b3656cb4dd..f9b79395c556a 100644 --- a/test/antithesis/workload/test/helper_mysql.py +++ b/test/antithesis/workload/test/helper_mysql.py @@ -36,7 +36,7 @@ def _retryable(exc: 
BaseException) -> bool: - return isinstance(exc, (pymysql.OperationalError, pymysql.InterfaceError)) + return isinstance(exc, pymysql.OperationalError | pymysql.InterfaceError) def _open(host: str, database: str) -> pymysql.connections.Connection: @@ -116,16 +116,12 @@ def execute_replica(sql: str, params: tuple = (), database: str = "mysql") -> No _execute(MYSQL_REPLICA_HOST, sql, params, database) -def query_primary( - sql: str, params: tuple = (), database: str = "mysql" -) -> list[tuple]: +def query_primary(sql: str, params: tuple = (), database: str = "mysql") -> list[tuple]: """Run a query on the MySQL primary and return all rows.""" return _query(MYSQL_HOST, sql, params, database) -def query_replica( - sql: str, params: tuple = (), database: str = "mysql" -) -> list[tuple]: +def query_replica(sql: str, params: tuple = (), database: str = "mysql") -> list[tuple]: """Run a query on the MySQL replica and return all rows.""" return _query(MYSQL_REPLICA_HOST, sql, params, database) diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py index 6572eddc9c7e4..34323a846faed 100644 --- a/test/antithesis/workload/test/helper_mysql_source.py +++ b/test/antithesis/workload/test/helper_mysql_source.py @@ -28,7 +28,6 @@ import os import psycopg - from helper_pg import create_source_idempotent, execute_retry, query_retry LOG = logging.getLogger("antithesis.helper_mysql_source") @@ -48,9 +47,7 @@ def ensure_mysql_connection() -> None: """Create the MySQL secret and connection in Materialize (idempotent).""" - execute_retry( - f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'" - ) + execute_retry(f"CREATE SECRET IF NOT EXISTS {SECRET_NAME} AS '{MYSQL_PASSWORD}'") execute_retry( f"CREATE CONNECTION IF NOT EXISTS {CONNECTION_NAME} TO MYSQL (" f"HOST '{MYSQL_REPLICA_HOST}', " @@ -58,7 +55,9 @@ def ensure_mysql_connection() -> None: f"PASSWORD SECRET {SECRET_NAME}" f")" ) - LOG.info("mysql 
connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST) + LOG.info( + "mysql connection %s ready (replica=%s)", CONNECTION_NAME, MYSQL_REPLICA_HOST + ) def ensure_mysql_cdc_table() -> None: diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py index 5c74276fe5f90..e3508c4f44b4a 100644 --- a/test/antithesis/workload/test/helper_pg.py +++ b/test/antithesis/workload/test/helper_pg.py @@ -142,14 +142,17 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non backoff = _RETRY_INITIAL_S while True: try: - with psycopg.connect( - host=PGHOST, - port=PGPORT_INTERNAL, - user=PGUSER_INTERNAL, - dbname=PGDATABASE, - connect_timeout=_CONNECT_TIMEOUT_S, - autocommit=True, - ) as conn, conn.cursor() as cur: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT_INTERNAL, + user=PGUSER_INTERNAL, + dbname=PGDATABASE, + connect_timeout=_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, + conn.cursor() as cur, + ): cur.execute(sql, params or ()) return except Exception as exc: # noqa: BLE001 diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py index 9c3c0e2461cbe..3c1a4e1359793 100755 --- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -45,6 +45,7 @@ import sys import helper_random +from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_none_source import ( SOURCE_NONE_TEXT, @@ -55,8 +56,6 @@ from helper_quiet import request_quiet_period from helper_source_stats import wait_for_catchup -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py 
b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py index c026be09ea522..876f5ff5a8e5e 100755 --- a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py +++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py @@ -45,12 +45,11 @@ import time import helper_random +from antithesis.assertions import always, sometimes from helper_pg import execute_retry, query_one_retry from helper_quiet import request_quiet_period from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py index 67a9627e1e386..c51330251bad8 100644 --- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py +++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py @@ -39,12 +39,11 @@ import helper_mysql import helper_random +from antithesis.assertions import always, sometimes from helper_mysql_source import SOURCE_NAME, TABLE_NAME from helper_pg import query_retry from helper_quiet import request_quiet_period -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 46ece1c308341..1c8dadf8f641d 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -8,23 +8,32 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -"""Antithesis-native randomized parallel SQL workload. 
- -This ports the *intent* of `test/parallel-workload/mzcompose.py` into the -existing Antithesis workload model without trying to ship the whole -`materialize.parallel_workload` Python stack inside the workload image. - -The driver deliberately shares a small fixed pool of objects across all -invocations and worker threads: - - one schema - - four tables - - four materialized views over those tables - -Workers race CREATE/DROP/INSERT/UPDATE/DELETE/SELECT against that pool. The -property is not result correctness; it is that concurrent randomized SQL under -fault injection should not surface *unexpected* query errors. Expected catalog -race/drop errors are counted and ignored, mirroring the philosophy of the -original parallel workload. +"""Antithesis driver wrapping the real `materialize.parallel_workload`. + +Earlier versions of this file reimplemented the *idea* of parallel-workload +(a fixed pool of objects, worker threads racing CREATE/DROP/INSERT/etc.). +That diverged from the canonical stress driver and forced us to rederive the +catalog-race error catalog by hand. This module instead bundles the real +`materialize.parallel_workload` package into the workload image (see +`mzbuild.yml` + `Dockerfile`) and invokes its `Worker`, `Action`, +`ActionList`, and `Database` classes directly. + +A few pieces of upstream's `parallel_workload.run()` orchestration don't +translate to the Antithesis topology: + + * Faults are injected at the container layer by Antithesis itself, so we + don't spawn `KillAction`/`BackupRestoreAction`/`ZeroDowntimeDeployAction` + worker threads. We still tag the database with `Scenario.Kill` so each + `Action.errors_to_ignore` includes connection-shaped errors — those are + expected here. + * `Database.create` unconditionally calls `setup_polaris_for_iceberg(...)` + and creates `postgres_conn` / `sql_server_conn` against services that + aren't in the Antithesis compose. 
We override `create` to skip that + setup and only wire up the kafka + minio connections the topology + actually has. + * `parallel_workload.run()` tunes a long list of `ALTER SYSTEM SET` knobs + and recreates the `quickstart` cluster. We skip the recreate (would + fight with `antithesis_cluster`) and apply only the size-limit knobs. """ from __future__ import annotations @@ -35,332 +44,350 @@ import sys import threading import time -from collections import Counter -from dataclasses import dataclass, field from typing import Any import helper_random import psycopg -from helper_pg import PGDATABASE, PGHOST, PGPORT, PGUSER, execute_retry - from antithesis.assertions import always, sometimes +from helper_pg import ( + PGDATABASE, + PGHOST, + PGPORT, + PGPORT_INTERNAL, + PGUSER, + PGUSER_INTERNAL, +) + +from materialize.parallel_workload.action import ( + ddl_action_list, + dml_nontrans_action_list, + fetch_action_list, + read_action_list, + write_action_list, +) +from materialize.parallel_workload.database import ( + MAX_CLUSTER_REPLICAS, + MAX_CLUSTERS, + MAX_KAFKA_SINKS, + MAX_KAFKA_SOURCES, + MAX_POSTGRES_SOURCES, + MAX_ROLES, + MAX_SCHEMAS, + MAX_TABLES, + MAX_VIEWS, + MAX_WEBHOOK_SOURCES, + Database, +) +from materialize.parallel_workload.executor import Executor +from materialize.parallel_workload.settings import Complexity, Scenario +from materialize.parallel_workload.worker import Worker +from materialize.parallel_workload.worker_exception import WorkerFailedException logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) LOG = logging.getLogger("driver.parallel_workload") -CLUSTER = os.environ.get("MZ_ANTITHESIS_CLUSTER", "antithesis_cluster") -SCHEMA = "antithesis_parallel_workload" - -TABLE_COUNT = 4 -WORKER_THREADS = 4 -RUNTIME_S = 25.0 -CONNECT_TIMEOUT_S = 5 -MAX_KEY = 31 -MAX_VALUE = 1000 - -EXPECTED_ERROR_SUBSTRINGS = [ - "already exists", - "does not exist", - "unknown catalog item", - "unknown schema", - 
"was dropped while executing a statement", - "another session modified the catalog while this DDL transaction was open", - "object state changed while transaction was in progress", - "query could not complete", - "cached plan must not change result type", - "the transaction's active cluster has been dropped", - "concurrent transaction", -] - - -@dataclass -class WorkerStats: - successes: int = 0 - reconnects: int = 0 - ignored_errors: int = 0 - actions: Counter[str] = field(default_factory=Counter) - ignored_by_reason: Counter[str] = field(default_factory=Counter) - unexpected: dict[str, Any] | None = None - - -def table_name(idx: int) -> str: - return f"{SCHEMA}.t{idx}" - - -def mv_name(idx: int) -> str: - return f"{SCHEMA}.mv{idx}" - - -def ensure_shared_objects() -> None: - execute_retry(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}") - for idx in range(2): - execute_retry( - f"CREATE TABLE IF NOT EXISTS {table_name(idx)} (" - "worker TEXT NOT NULL, " - "k BIGINT NOT NULL, " - "v BIGINT NOT NULL" - ")" +# Antithesis Test Composer invokes drivers in tight loops, so this script is +# intentionally short. The cap exists so a single iteration can't monopolise +# the fault-injection budget; the goal is repeated short bursts. +RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20")) +NUM_THREADS = int(os.environ.get("PW_THREADS", "4")) + + +def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None: + try: + cur.execute(stmt.encode()) + except Exception as exc: # noqa: BLE001 + LOG.warning("ALTER SYSTEM tolerated: %s (%s)", stmt, exc) + + +def _prepare_system(num_threads: int) -> None: + """Apply the catalog-size knobs from `parallel_workload.run()` so the + workload doesn't trip default limits. The privilege grants mirror upstream + so most queries don't fail on permissions. 
Idempotent across drivers.""" + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT_INTERNAL, + user=PGUSER_INTERNAL, + dbname=PGDATABASE, + autocommit=True, + connect_timeout=15, + ) as conn, + conn.cursor() as cur, + ): + _alter_system( + cur, + f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 40 + num_threads}", + ) + _alter_system( + cur, f"ALTER SYSTEM SET max_tables = {MAX_TABLES * 40 + num_threads}" + ) + _alter_system( + cur, + f"ALTER SYSTEM SET max_materialized_views = {MAX_VIEWS * 40 + num_threads}", + ) + _alter_system( + cur, + f"ALTER SYSTEM SET max_sources = " + f"{(MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 40 + num_threads}", + ) + _alter_system( + cur, f"ALTER SYSTEM SET max_sinks = {MAX_KAFKA_SINKS * 40 + num_threads}" + ) + _alter_system( + cur, f"ALTER SYSTEM SET max_roles = {MAX_ROLES * 1000 + num_threads}" + ) + _alter_system( + cur, f"ALTER SYSTEM SET max_clusters = {MAX_CLUSTERS * 40 + num_threads}" ) + _alter_system( + cur, + f"ALTER SYSTEM SET max_replicas_per_cluster = " + f"{MAX_CLUSTER_REPLICAS * 40 + num_threads}", + ) + _alter_system(cur, "ALTER SYSTEM SET max_secrets = 1000000") + _alter_system(cur, "ALTER SYSTEM SET idle_in_transaction_session_timeout = 0") + for object_type in ( + "TABLES", + "TYPES", + "SECRETS", + "CONNECTIONS", + "DATABASES", + "SCHEMAS", + "CLUSTERS", + ): + _alter_system( + cur, + f"ALTER DEFAULT PRIVILEGES FOR ALL ROLES " + f"GRANT ALL PRIVILEGES ON {object_type} TO PUBLIC", + ) -def connect() -> psycopg.Connection[Any]: - return psycopg.connect( - host=PGHOST, - port=PGPORT, - user=PGUSER, - dbname=PGDATABASE, - connect_timeout=CONNECT_TIMEOUT_S, - autocommit=True, - ) +def _create_database_for_antithesis(database: Database, exe: Executor) -> None: + """Stand-in for `Database.create` that only sets up connections matching + the Antithesis topology. 
Upstream's `create()` also wires polaris, + sql-server, and an external postgres source — none of those are running + in this compose.""" + from pg8000.native import identifier + for db in database.dbs: + db.drop(exe) + db.create(exe) -def choose_action(rng: random.Random) -> str: - return rng.choices( - [ - "create_table", - "drop_table", - "insert", - "update", - "delete", - "select_table", - "create_mv", - "drop_mv", - "select_mv", - ], - weights=[6, 2, 25, 12, 10, 20, 6, 2, 17], - k=1, - )[0] - - -def execute_action( - conn: psycopg.Connection[Any], rng: random.Random, worker_name: str, action: str -) -> None: - idx = rng.randrange(TABLE_COUNT) - table = table_name(idx) - mv = mv_name(idx) - - with conn.cursor() as cur: - if action == "create_table": - cur.execute( - f"CREATE TABLE IF NOT EXISTS {table} (" - "worker TEXT NOT NULL, " - "k BIGINT NOT NULL, " - "v BIGINT NOT NULL" - ")" - ) - elif action == "drop_table": - cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE") - elif action == "insert": - cur.execute( - f"INSERT INTO {table} (worker, k, v) VALUES (%s, %s, %s)", - ( - worker_name, - rng.randint(0, MAX_KEY), - rng.randint(0, MAX_VALUE), - ), - ) - elif action == "update": - cur.execute( - f"UPDATE {table} SET v = v + 1 WHERE k = %s", - (rng.randint(0, MAX_KEY),), - ) - elif action == "delete": - cur.execute( - f"DELETE FROM {table} WHERE k = %s", - (rng.randint(0, MAX_KEY),), - ) - elif action == "select_table": - cur.execute( - f"SELECT count(*)::bigint, min(v)::bigint, max(v)::bigint FROM {table}" - ) - cur.fetchall() - elif action == "create_mv": - cur.execute( - f"CREATE MATERIALIZED VIEW IF NOT EXISTS {mv} " - f"IN CLUSTER {CLUSTER} AS " - f"SELECT worker, count(*)::bigint AS c, sum(v)::bigint AS s " - f"FROM {table} GROUP BY worker" - ) - elif action == "drop_mv": - cur.execute(f"DROP MATERIALIZED VIEW IF EXISTS {mv}") - elif action == "select_mv": - cur.execute( - f"SELECT count(*)::bigint, sum(c)::bigint, sum(s)::bigint FROM {mv}" - ) - 
cur.fetchall() - else: - raise ValueError(f"unknown action {action}") + exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + exe.execute("DROP SECRET IF EXISTS minio CASCADE") + exe.execute("DROP CONNECTION IF EXISTS aws_conn CASCADE") + exe.execute("DROP CONNECTION IF EXISTS kafka_conn CASCADE") + exe.execute("DROP CONNECTION IF EXISTS csr_conn CASCADE") -def expected_error_reason(exc: BaseException) -> str | None: - msg = str(exc) - for candidate in EXPECTED_ERROR_SUBSTRINGS: - if candidate in msg: - return candidate - return None + exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP ROLE {identifier(row[0])}") + exe.execute( + "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA " + "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT" + ) + exe.execute( + "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA " + "REGISTRY URL 'http://schema-registry:8081'" + ) + exe.execute("CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'") + exe.execute( + "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS (" + "ENDPOINT 'http://minio:9000/', REGION 'minio', " + "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET minio)" + ) -def is_connection_error(exc: BaseException) -> bool: - return isinstance(exc, (psycopg.OperationalError, psycopg.InterfaceError)) + for relation in database: + relation.create(exe) + + +def _spawn_workers( + rng: random.Random, + database: Database, + end_time: float, + num_threads: int, +) -> tuple[list[Worker], list[threading.Thread]]: + """Build the same thread pool `parallel_workload.run()` does for + `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper.""" + weights = [60, 30, 30, 30, 100] + workers: list[Worker] = [] + threads: list[threading.Thread] = [] + for i in range(num_threads): + worker_rng = random.Random(rng.randrange(1_000_000)) + action_list = 
worker_rng.choices( + [ + read_action_list, + fetch_action_list, + write_action_list, + dml_nontrans_action_list, + ddl_action_list, + ], + weights, + )[0] + actions = [ + action_class(worker_rng, None) + for action_class in action_list.action_classes + ] + worker = Worker( + worker_rng, + actions, + action_list.weights, + end_time, + action_list.autocommit, + system=False, + composition=None, + action_list=action_list, + ) + workers.append(worker) + thread = threading.Thread( + name=f"pw-worker-{i}", + target=worker.run, + args=(PGHOST, PGPORT, 6876, PGUSER, database), + ) + thread.start() + threads.append(thread) + return workers, threads -def run_worker( - worker_id: int, - seed: int, - deadline: float, - stop: threading.Event, - stats: WorkerStats, -) -> None: +def main() -> int: + seed = str(helper_random.random_u64()) rng = random.Random(seed) - worker_name = f"pw{worker_id}" - conn: psycopg.Connection[Any] | None = None - try: - while time.monotonic() < deadline and not stop.is_set(): - if conn is None or conn.closed: - try: - conn = connect() - except Exception as exc: # noqa: BLE001 - if not is_connection_error(exc): - stats.unexpected = { - "worker": worker_name, - "action": "connect", - "error": str(exc), - } - stop.set() - return - stats.reconnects += 1 - time.sleep(rng.uniform(0.05, 0.2)) - continue - - action = choose_action(rng) - try: - execute_action(conn, rng, worker_name, action) - stats.successes += 1 - stats.actions[action] += 1 - except Exception as exc: # noqa: BLE001 - if is_connection_error(exc): - stats.reconnects += 1 - try: - conn.close() - except Exception: # noqa: BLE001 - pass - conn = None - continue - - reason = expected_error_reason(exc) - if reason is not None: - stats.ignored_errors += 1 - stats.ignored_by_reason[reason] += 1 - stats.actions[action] += 1 - continue - - stats.unexpected = { - "worker": worker_name, - "action": action, - "error": str(exc), - } - LOG.exception("unexpected parallel workload error") - stop.set() - 
return - - time.sleep(rng.uniform(0.005, 0.05)) - finally: - if conn is not None: - try: - conn.close() - except Exception: # noqa: BLE001 - pass + LOG.info( + "parallel-workload starting: seed=%s threads=%d runtime=%ss", + seed, + NUM_THREADS, + RUNTIME_S, + ) + _prepare_system(NUM_THREADS) -def main() -> int: - ensure_shared_objects() - - stop = threading.Event() - deadline = time.monotonic() + RUNTIME_S - seeds = [helper_random.random_u64() for _ in range(WORKER_THREADS)] - stats = [WorkerStats() for _ in range(WORKER_THREADS)] - threads = [ - threading.Thread( - name=f"parallel-workload-{idx}", - target=run_worker, - args=(idx, seeds[idx], deadline, stop, stats[idx]), - ) - for idx in range(WORKER_THREADS) - ] + # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection + # drops, which mirrors what Antithesis container-pauses look like at the + # client. We never instantiate `KillAction` itself. + database = Database( + rng=rng, + seed=seed, + host=PGHOST, + ports={ + "materialized": PGPORT, + "mz_system": PGPORT_INTERNAL, + "http": 6876, + "kafka": 9092, + "schema-registry": 8081, + }, + complexity=Complexity.DDL, + scenario=Scenario.Kill, + naughty_identifiers=False, + ) - LOG.info("parallel workload starting; schema=%s threads=%d", SCHEMA, WORKER_THREADS) - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - total_successes = sum(worker.successes for worker in stats) - total_reconnects = sum(worker.reconnects for worker in stats) - total_ignored = sum(worker.ignored_errors for worker in stats) - action_counts = Counter[str]() - ignored_by_reason = Counter[str]() - unexpected = next((worker.unexpected for worker in stats if worker.unexpected), None) - for worker in stats: - action_counts.update(worker.actions) - ignored_by_reason.update(worker.ignored_by_reason) + end_time = time.time() + RUNTIME_S - sometimes( - total_successes >= WORKER_THREADS * 5, - "parallel workload: randomized concurrent SQL executed 
successfully", - { - "successes": total_successes, - "threads": WORKER_THREADS, - "actions": dict(action_counts), - "reconnects": total_reconnects, - }, + setup_failure: Exception | None = None + try: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + autocommit=True, + connect_timeout=15, + ) as setup_conn, + setup_conn.cursor() as setup_cur, + ): + setup_exe = Executor(rng, setup_cur, None, database) + _create_database_for_antithesis(database, setup_exe) + except Exception as exc: # noqa: BLE001 + setup_failure = exc + LOG.exception("parallel-workload setup failed") + + workers: list[Worker] = [] + threads: list[threading.Thread] = [] + worker_failed: WorkerFailedException | None = None + if setup_failure is None: + workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS) + try: + while time.time() < end_time: + dead = [t for t in threads if not t.is_alive()] + if dead: + occurred = next( + (w.occurred_exception for w in workers if w.occurred_exception), + None, + ) + worker_failed = WorkerFailedException( + f"thread {dead[0].name} exited early", occurred + ) + for worker in workers: + worker.end_time = time.time() + break + time.sleep(0.5) + finally: + for worker in workers: + worker.end_time = time.time() + for thread in threads: + thread.join(timeout=30) + + total_queries = sum(w.num_queries.total() for w in workers) + total_ignored = sum( + count + for w in workers + for counter in w.ignored_errors.values() + for count in counter.values() ) + sometimes( - action_counts["create_table"] - + action_counts["drop_table"] - + action_counts["create_mv"] - + action_counts["drop_mv"] - > 0, - "parallel workload: DDL actions were exercised", + total_queries >= NUM_THREADS, + "parallel workload: randomized concurrent SQL executed successfully", { - "create_table": action_counts["create_table"], - "drop_table": action_counts["drop_table"], - "create_mv": action_counts["create_mv"], - "drop_mv": 
action_counts["drop_mv"], + "queries": total_queries, + "threads": NUM_THREADS, + "ignored_errors": total_ignored, }, ) sometimes( total_ignored > 0, "parallel workload: expected concurrent-catalog races were observed", - { - "ignored_errors": total_ignored, - "ignored_by_reason": dict(ignored_by_reason), - }, + {"ignored_errors": total_ignored}, ) + + unexpected = None + if setup_failure is not None: + unexpected = {"phase": "setup", "error": str(setup_failure)} + elif worker_failed is not None: + unexpected = { + "phase": "worker", + "error": ( + str(worker_failed.cause) if worker_failed.cause else str(worker_failed) + ), + } + always( unexpected is None, "parallel workload: no unexpected SQL errors escaped the randomized stress driver", { "unexpected": unexpected, - "successes": total_successes, + "queries": total_queries, "ignored_errors": total_ignored, - "reconnects": total_reconnects, - "actions": dict(action_counts), + "threads": NUM_THREADS, }, ) LOG.info( - "parallel workload done; successes=%d ignored=%d reconnects=%d unexpected=%s", - total_successes, + "parallel-workload done: queries=%d ignored=%d unexpected=%s", + total_queries, total_ignored, - total_reconnects, unexpected, ) return 1 if unexpected is not None else 0 if __name__ == "__main__": - _ = (PGHOST, PGPORT, PGUSER, PGDATABASE, os) sys.exit(main()) diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py index c4af73b434635..19e7d1d698dbc 100755 --- a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py +++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py @@ -57,6 +57,7 @@ import helper_random import psycopg +from antithesis.assertions import always, sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -67,8 +68,6 @@ from helper_quiet import request_quiet_period from helper_table_mv import MV_NAME, TABLE_MV_INPUT, 
ensure_table_and_mv -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -91,14 +90,17 @@ def _fresh_select_count(prefix: str) -> int | None: but defends against future changes to the system default. """ try: - with psycopg.connect( - host=PGHOST, - port=PGPORT, - user=PGUSER, - dbname=PGDATABASE, - connect_timeout=PROBE_CONNECT_TIMEOUT_S, - autocommit=True, - ) as conn, conn.cursor() as cur: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=PROBE_CONNECT_TIMEOUT_S, + autocommit=True, + ) as conn, + conn.cursor() as cur, + ): cur.execute("SET transaction_isolation TO 'strict serializable'") cur.execute( f"SELECT row_count::bigint FROM {MV_NAME} WHERE prefix = %s", diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index 066620aaf6ded..b58c15adcfa34 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -37,6 +37,7 @@ import sys import helper_random +from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_pg import query_one_retry from helper_quiet import request_quiet_period @@ -47,8 +48,6 @@ ensure_upsert_text_source, ) -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py index 59385a59a7ac7..53e791185b4ab 100755 --- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py +++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py @@ 
-53,6 +53,7 @@ import helper_random import psycopg +from antithesis.assertions import always, sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -62,8 +63,6 @@ query_retry, ) -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -88,14 +87,17 @@ def _fresh_observed_tables(name_prefix: str) -> set[str] | None: rather than blaming the property for a fault-window read. """ try: - with psycopg.connect( - host=PGHOST, - port=PGPORT, - user=PGUSER, - dbname=PGDATABASE, - connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), - autocommit=True, - ) as conn, conn.cursor() as cur: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + connect_timeout=int(PROBE_CONNECT_TIMEOUT_S), + autocommit=True, + ) as conn, + conn.cursor() as cur, + ): cur.execute( "SELECT name FROM mz_tables WHERE name LIKE %s", (f"{name_prefix}%",), @@ -155,7 +157,12 @@ def _run_cycle( try: execute_retry(f"DROP TABLE {table}") except Exception as exc: # noqa: BLE001 - LOG.info("cycle %d: DROP %s failed (%s); not updating model", cycle_idx, table, exc) + LOG.info( + "cycle %d: DROP %s failed (%s); not updating model", + cycle_idx, + table, + exc, + ) return False, new_id expected.discard(table) else: @@ -163,7 +170,12 @@ def _run_cycle( try: execute_retry(f"CREATE TABLE {table} (id BIGINT NOT NULL)") except Exception as exc: # noqa: BLE001 - LOG.info("cycle %d: CREATE %s failed (%s); not updating model", cycle_idx, table, exc) + LOG.info( + "cycle %d: CREATE %s failed (%s); not updating model", + cycle_idx, + table, + exc, + ) return False, new_id expected.add(table) new_id += 1 @@ -172,7 +184,9 @@ def _run_cycle( # assertion — a fault-window read is not regression evidence. 
observed = _fresh_observed_tables(name_prefix) if observed is None: - LOG.info("cycle %d: fresh-connection read failed; skipping assertion", cycle_idx) + LOG.info( + "cycle %d: fresh-connection read failed; skipping assertion", cycle_idx + ) return False, new_id always( diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index 5f3c13bcdce57..d7ccedb9e1a3b 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -58,6 +58,7 @@ import time import helper_random +from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_pg import query_one_retry from helper_quiet import request_quiet_period @@ -68,8 +69,6 @@ ensure_upsert_text_source, ) -from antithesis.assertions import always, sometimes - logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) From d72fc00f59b41ac01b79a4721d0fcf12cd384810 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 13 May 2026 23:56:27 +0800 Subject: [PATCH 37/65] try to fix logging --- .../workload/test/parallel_driver_parallel_workload.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 1c8dadf8f641d..d5352500b9440 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -58,6 +58,7 @@ PGUSER_INTERNAL, ) +from materialize.parallel_workload import executor as _pw_executor from materialize.parallel_workload.action import ( ddl_action_list, dml_nontrans_action_list, @@ -83,6 +84,15 @@ from materialize.parallel_workload.worker import Worker from 
materialize.parallel_workload.worker_exception import WorkerFailedException +# `parallel_workload.executor` declares module-level `logging: TextIO | None` +# and `lock: threading.Lock` as PEP-526 annotations only; they are bound by +# `initialize_logging()`. `Executor.log()` does `if not logging: return`, +# which raises `NameError` before that initialiser runs. We don't want the +# per-query log file (drivers run many times under Antithesis); bind both +# names to no-op values so `log()` returns immediately. +_pw_executor.logging = None +_pw_executor.lock = threading.Lock() + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) From bd5fbc4e02fa20d0825712b58a044d5b24d830f9 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 17:51:08 -0400 Subject: [PATCH 38/65] =?UTF-8?q?test/antithesis:=20helper=5Fpg.query=5Fre?= =?UTF-8?q?try=20=E2=80=94=20opt-in=20real=5Ftime=5Frecency=20kwarg?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the SELECT in a session that has SET real_time_recency = TRUE. Under strict-serializable, this pushes the chosen-ts lower bound to the source's real-time upstream frontier, so the SELECT waits for ingestion to reach the broker/upstream high-water mark before responding. Existing 'wait_for_catchup' on mz_source_statistics.offset_committed is insufficient as a queryability gate: offset_committed tracks the data-shard upper, which can advance past oracle_read_ts via the source's reclock while the corresponding rows live at an mz_ts further forward (assigned by the next-probe binding). The strict-serializable SELECT then picks a chosen-ts between the two and returns count=0. Used by drivers that produce-then-assert against kafka/mysql sources. MV-over- table drivers don't need this; tables have no upstream to probe and the table writer's commit already advances the timestamp oracle. 
--- test/antithesis/workload/test/helper_pg.py | 29 +++++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py index e3508c4f44b4a..59a88f1963ab3 100644 --- a/test/antithesis/workload/test/helper_pg.py +++ b/test/antithesis/workload/test/helper_pg.py @@ -108,13 +108,32 @@ def execute_retry(sql: str, params: Sequence[Any] | None = None) -> None: backoff = min(backoff * 2, _RETRY_MAX_S) -def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any, ...]]: - """Run a query and return all rows, retrying transient errors.""" +def query_retry( + sql: str, + params: Sequence[Any] | None = None, + real_time_recency: bool = False, +) -> list[tuple[Any, ...]]: + """Run a query and return all rows, retrying transient errors. + + Set `real_time_recency=True` when the query is a queryability gate after a + just-produced upstream write. With strict-serializable (the workload + default) plus real-time recency, the coordinator pushes the SELECT + timestamp's lower bound to the source's real-time frontier — i.e. the + SELECT waits for ingestion to reach the broker/upstream's current + high-water mark before responding. Without this, `wait_for_catchup` on + `mz_source_statistics.offset_committed` can clear before the just-ingested + rows are visible at the timestamp the SELECT chooses (`offset_committed` + tracks the data-shard upper, which can advance past `oracle_read_ts` while + the rows live at an mz_ts further forward — assigned by the reclock's + next-probe binding). 
+ """ deadline = time.monotonic() + _RETRY_BUDGET_S backoff = _RETRY_INITIAL_S while True: try: with connect() as conn, conn.cursor() as cur: + if real_time_recency: + cur.execute("SET real_time_recency = TRUE") cur.execute(sql, params or ()) return list(cur.fetchall()) except Exception as exc: # noqa: BLE001 @@ -126,9 +145,11 @@ def query_retry(sql: str, params: Sequence[Any] | None = None) -> list[tuple[Any def query_one_retry( - sql: str, params: Sequence[Any] | None = None + sql: str, + params: Sequence[Any] | None = None, + real_time_recency: bool = False, ) -> tuple[Any, ...] | None: - rows = query_retry(sql, params) + rows = query_retry(sql, params, real_time_recency=real_time_recency) return rows[0] if rows else None From 312537f18358a25f2477a34a0a0d1795960feaee Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 17:51:32 -0400 Subject: [PATCH 39/65] test/antithesis: drivers use real_time_recency for queryability gate Apply real_time_recency=True to the SELECTs that follow wait_for_catchup or the equivalent in: - parallel_driver_upsert_latest_value.py - singleton_driver_upsert_state_rehydration.py - parallel_driver_kafka_none_envelope.py - parallel_driver_mysql_cdc.py These drivers all produce upstream (kafka/mysql), wait for a catchup signal, then SELECT and assert. The current catchup signal (offset_committed in mz_source_statistics, or a COUNT-based poll) clears before the just-ingested rows are visible at the strict-serializable read timestamp the SELECT picks: * offset_committed reflects the data-shard upper reclocked to upstream offsets. It can advance past oracle_read_ts via the source's reclock binding while the corresponding rows live at an mz_ts further forward (assigned by the next-probe binding). * COUNT-based polling only requires a single chosen-ts where the count matches; the immediately-following per-row SELECT picks oracle_read_ts afresh and can race. 
real_time_recency forces the SELECT's chosen-ts lower bound to the source's real-time upstream frontier, so the SELECT waits for ingestion to reach the broker's/replica's current high-water mark before responding. See the docstring on helper_pg.query_retry for the full reasoning. Not applied to parallel_driver_mv_reflects_table_updates: tables have no upstream to probe (RTR no-ops), and the existing count-based poll on the MV is already queryability-based. Not applied to parallel_driver_strict_serializable_reads: it already opens fresh connections with explicit SET REAL_TIME_RECENCY TO TRUE. --- .../workload/test/parallel_driver_kafka_none_envelope.py | 6 +++++- test/antithesis/workload/test/parallel_driver_mysql_cdc.py | 6 ++++++ .../workload/test/parallel_driver_upsert_latest_value.py | 5 +++++ .../test/singleton_driver_upsert_state_rehydration.py | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py index 3c1a4e1359793..bbb4e2529eca8 100755 --- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -132,7 +132,9 @@ def main() -> int: # ----- no-data-duplication ----- # `GROUP BY partition, "offset" HAVING COUNT(*) > 1` filtered to this # invocation's payloads. The catalog's `kafka-source-no-data-duplication` - # property names this exact query shape. + # property names this exact query shape. real_time_recency forces the + # SELECT past the kafka broker's real-time frontier — see + # helper_pg.query_retry for why this is required. 
dup_rows = query_retry( f""" SELECT partition, "offset", COUNT(*)::bigint @@ -142,6 +144,7 @@ def main() -> int: HAVING COUNT(*) > 1 """, (f"{prefix}:%",), + real_time_recency=True, ) always( len(dup_rows) == 0, @@ -175,6 +178,7 @@ def main() -> int: GROUP BY 1, 2, 3 """, (f"{prefix}:%",), + real_time_recency=True, ) by_payload: dict[str, tuple[int, int, int]] = {} for text, partition, offset, count in rows: diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py index c51330251bad8..233207ff8e3c6 100644 --- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py +++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py @@ -130,9 +130,14 @@ def _wait_for_catchup(batch_id: str, expected_count: int) -> bool: def _check_rows(expected: dict[str, str]) -> None: """Assert every expected row has the correct value in the Materialize source.""" for row_id, want in expected.items(): + # real_time_recency: the count-based catchup above can clear at a + # chosen-ts that just barely satisfies the COUNT, leaving a per-row + # SELECT moments later to race. RTR pushes chosen-ts to the mysql + # upstream's real-time frontier; see helper_pg.query_retry. 
rows = query_retry( f"SELECT value FROM {TABLE_NAME} WHERE id = %s", (row_id,), + real_time_recency=True, ) found = bool(rows) observed = rows[0][0] if found else None @@ -199,6 +204,7 @@ def main() -> int: rows = query_retry( f"SELECT COUNT(*)::bigint FROM {TABLE_NAME} WHERE batch_id = %s", (batch_id,), + real_time_recency=True, ) count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0 always( diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index b58c15adcfa34..fcfabea77620d 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -85,9 +85,14 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]: out of scope for this property and should be caught by `kafka-source-no-data-duplication`. """ + # real_time_recency forces the SELECT timestamp past the kafka source's + # real-time upstream frontier, so the row written for this key is visible + # at chosen-ts. `wait_for_catchup` on `offset_committed` alone is not + # sufficient — see helper_pg.query_retry for the full reasoning. row = query_one_retry( f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", (key,), + real_time_recency=True, ) if row is None: return False, None diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index d7ccedb9e1a3b..26342d0ed43e8 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -91,9 +91,11 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]: """Duplicate of `_select_value_for_key` in `parallel_driver_upsert_latest_value.py`. 
Kept inline to avoid expanding helper surface for one shared private function.""" + # See helper_pg.query_retry for why real_time_recency is required here. row = query_one_retry( f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", (key,), + real_time_recency=True, ) if row is None: return False, None From d43144d5b7d71d3446d46f9435b2f1ac3d6f2d11 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 17:55:43 -0400 Subject: [PATCH 40/65] test/antithesis: parallel_driver_parallel_workload setup phase tolerates concurrent races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple parallel-driver invocations race the deterministic object-name pool in _create_database_for_antithesis (role0..roleN, cluster-0..cluster-N, etc). Setup statements run without IF NOT EXISTS / IF EXISTS guards in many places, and there is no IF EXISTS form for DROP CLUSTER or DROP ROLE — so the loser of any given race sees: * cluster 'cluster-0' already exists * unknown role 'role0' * unknown cluster 'cluster-0' * role "role0" cannot be dropped because some objects depend on it These are the same concurrent-DDL outcomes the parallel_workload framework already tolerates inside the worker loop via Action.errors_to_ignore at DDL complexity. The setup phase had no equivalent tolerance, so any of these escaped as a setup_failure and the always-zero-exit assertion fired: always(unexpected is None, "parallel workload: no unexpected SQL errors …") Add _tolerate_setup_race that catches QueryError or Exception with any of the expected race substrings and proceeds. Wrap every setup statement, including db.drop/db.create, the cluster/role enumerate-and-drop loops, the DROP/CREATE CONNECTION + SECRET statements, and the per-relation create loop. The pattern list mirrors action.Action.errors_to_ignore for the DDL tier. 
--- .../test/parallel_driver_parallel_workload.py | 95 +++++++++++++++---- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index d5352500b9440..ec3b91e9b38c6 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -58,6 +58,7 @@ PGUSER_INTERNAL, ) +from materialize.data_ingest.query_error import QueryError from materialize.parallel_workload import executor as _pw_executor from materialize.parallel_workload.action import ( ddl_action_list, @@ -175,47 +176,107 @@ def _prepare_system(num_threads: int) -> None: ) +# Expected substring matches for SQL errors raised during the setup phase when +# multiple parallel-driver invocations race the same deterministic object +# names (`role0`, `cluster-0`, etc.). Each invocation does best-effort cleanup +# + create; whoever loses the race sees one of these and continues. The same +# patterns are already tolerated by the parallel_workload framework itself in +# `action.Action.errors_to_ignore` for the DDL complexity tier, so the setup +# phase tolerates the same surface area. +_SETUP_RACE_PATTERNS = ( + "already exists", + "unknown role", + "unknown cluster", + "unknown schema", + "unknown catalog item", + "cannot be dropped because", + "was concurrently dropped", + "was removed", + "' was dropped", + "was dropped while executing a statement", + "another session modified the catalog", + "object state changed while transaction was in progress", +) + + +def _tolerate_setup_race(fn, *args, **kwargs): + """Run `fn(...)`, swallowing the concurrent-race messages in + `_SETUP_RACE_PATTERNS` and propagating anything else. + + The setup phase is invoked by every parallel-driver invocation, and the + framework picks deterministic object names from a small pool. 
Concurrent + invocations therefore race to drop-then-create the same names; any single + race outcome is fine because the per-invocation Database object only + needs its named objects to exist by the time worker threads start. + """ + try: + return fn(*args, **kwargs) + except QueryError as exc: + if any(pat in (exc.msg or "") for pat in _SETUP_RACE_PATTERNS): + LOG.debug("setup tolerated: %s — %s", exc.query, exc.msg) + return None + raise + except Exception as exc: # noqa: BLE001 + if any(pat in str(exc) for pat in _SETUP_RACE_PATTERNS): + LOG.debug("setup tolerated: %s", exc) + return None + raise + + def _create_database_for_antithesis(database: Database, exe: Executor) -> None: """Stand-in for `Database.create` that only sets up connections matching the Antithesis topology. Upstream's `create()` also wires polaris, sql-server, and an external postgres source — none of those are running - in this compose.""" + in this compose. + + Every statement is wrapped with `_tolerate_setup_race` because parallel + invocations of this driver race the same deterministic object names + (`role0..roleN`, `cluster-0..cluster-N`). Whoever loses the race for a + given object sees a known race message — already-exists, unknown-role, + unknown-cluster, or a transient DEPENDS-ON cleanup mismatch — and the + other invocation's outcome is fine for our purposes. 
+ """ from pg8000.native import identifier for db in database.dbs: - db.drop(exe) - db.create(exe) + _tolerate_setup_race(db.drop, exe) + _tolerate_setup_race(db.create, exe) exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") for row in exe.cur.fetchall(): - exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + _tolerate_setup_race( + exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE" + ) - exe.execute("DROP SECRET IF EXISTS minio CASCADE") - exe.execute("DROP CONNECTION IF EXISTS aws_conn CASCADE") - exe.execute("DROP CONNECTION IF EXISTS kafka_conn CASCADE") - exe.execute("DROP CONNECTION IF EXISTS csr_conn CASCADE") + _tolerate_setup_race(exe.execute, "DROP SECRET IF EXISTS minio CASCADE") + _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS aws_conn CASCADE") + _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS kafka_conn CASCADE") + _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS csr_conn CASCADE") exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") for row in exe.cur.fetchall(): - exe.execute(f"DROP ROLE {identifier(row[0])}") + _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}") - exe.execute( + _tolerate_setup_race( + exe.execute, "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA " - "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT" + "BROKER 'kafka:9092', SECURITY PROTOCOL PLAINTEXT", ) - exe.execute( + _tolerate_setup_race( + exe.execute, "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA " - "REGISTRY URL 'http://schema-registry:8081'" + "REGISTRY URL 'http://schema-registry:8081'", ) - exe.execute("CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'") - exe.execute( + _tolerate_setup_race(exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'") + _tolerate_setup_race( + exe.execute, "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS (" "ENDPOINT 'http://minio:9000/', REGION 'minio', " - "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET 
minio)" + "ACCESS KEY ID 'minioadmin', SECRET ACCESS KEY SECRET minio)", ) for relation in database: - relation.create(exe) + _tolerate_setup_race(relation.create, exe) def _spawn_workers( From 26a70cee0475ba9cf7f29fa3c1cab7a109566ed2 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 17:57:38 -0400 Subject: [PATCH 41/65] test/antithesis: _replica_non_online queries history table, not current-state view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Sometimes assertions: * fault recovery: observed antithesis_cluster replica non-online at least once * kafka source resumes: observed antithesis_cluster replica non-online both rely on `_replica_non_online()` returning True at least once across all invocations in a run. The previous implementation queried `mz_cluster_replica_statuses` (DISTINCT ON (replica_id, process_id) over the underlying history shard), which shows only the latest tick per process. With a 0.5s probe cadence and a 30s invocation budget, an Antithesis fault that takes a clusterd offline-then-back-online within a sub-second window slips between two consecutive polls — the SDK never sees a non-online status, the Sometimes assertion never fires, and we get a 0-pass / N-fail finding even though the fault recipe is correctly hitting the cluster. Switch to `mz_internal.mz_cluster_replica_status_history` and filter on `h.status = 'offline'`. This is the underlying audit log; any past offline event remains visible from any later poll within the retention window, so we record the fault even if the transition fully completed before the next probe. Same change in both drivers (the helper was duplicated). 
--- .../test/anytime_fault_recovery_exercised.py | 18 ++++++++++++++---- ...anytime_kafka_source_resumes_after_fault.py | 12 +++++++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py index ff90867b0b6f5..65f3ed4f695f0 100755 --- a/test/antithesis/workload/test/anytime_fault_recovery_exercised.py +++ b/test/antithesis/workload/test/anytime_fault_recovery_exercised.py @@ -93,7 +93,17 @@ def _probe_select_one() -> bool: def _replica_non_online() -> bool: - """Best-effort: is any antithesis-cluster replica reporting non-online? + """Did any antithesis_cluster replica record an `offline` status at any + point in this timeline? + + Queries `mz_cluster_replica_status_history` (audit log) rather than + `mz_cluster_replica_statuses` (current-state view). The current-state + view shows only the latest tick per (replica, process), so a transient + offline window — exactly the shape Antithesis fault injection creates + when it pauses or kills clusterd1 / clusterd2 for a few seconds — can + open and close between two consecutive polls and the assertion never + fires. The history table is sticky: once an offline event is recorded + it stays observable from any later poll within the retention window. 
Uses the retry-budgeted query helper because we want a clear yes/no, not a probe outcome — if the helper can't get an answer we conservatively @@ -105,10 +115,10 @@ def _replica_non_online() -> bool: """ SELECT EXISTS ( SELECT 1 - FROM mz_internal.mz_cluster_replica_statuses s - JOIN mz_cluster_replicas r ON r.id = s.replica_id + FROM mz_internal.mz_cluster_replica_status_history h + JOIN mz_cluster_replicas r ON r.id = h.replica_id JOIN mz_clusters c ON c.id = r.cluster_id - WHERE c.name = %s AND s.status != 'online' + WHERE c.name = %s AND h.status = 'offline' ) """, (ANTITHESIS_CLUSTER,), diff --git a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py index b453f62631aac..9c10879bd8291 100755 --- a/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py +++ b/test/antithesis/workload/test/anytime_kafka_source_resumes_after_fault.py @@ -100,15 +100,21 @@ def _offset_committed(source_name: str) -> int | None: def _replica_non_online() -> bool: + """Did any antithesis_cluster replica record an `offline` status in this + timeline? Queries the audit history (`mz_cluster_replica_status_history`) + rather than the current-state view so a transient offline window between + two polls is still observable. See the matching helper in + `anytime_fault_recovery_exercised.py` for the full reasoning. 
+ """ try: row = query_one_retry( """ SELECT EXISTS ( SELECT 1 - FROM mz_internal.mz_cluster_replica_statuses s - JOIN mz_cluster_replicas r ON r.id = s.replica_id + FROM mz_internal.mz_cluster_replica_status_history h + JOIN mz_cluster_replicas r ON r.id = h.replica_id JOIN mz_clusters c ON c.id = r.cluster_id - WHERE c.name = %s AND s.status != 'online' + WHERE c.name = %s AND h.status = 'offline' ) """, (ANTITHESIS_CLUSTER,), From adaed905255634354b94aef816f60a3764c62069 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 18:17:00 -0400 Subject: [PATCH 42/65] parallel_workload: pool-backed mode with seed-scoped names and external clusterds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three coupled additions to make parallel_workload safe to run as multiple concurrent invocations sharing one Materialize instance, with each invocation's cluster routed to a dedicated external clusterd container. 1. Database(seed_scoped_names: bool = False). When True, forwards a 'name_scope' string to every Role and Cluster the framework creates, producing 'cluster-{seed}-{N}' and 'role-{seed}-{N}' rather than 'cluster-{N}' / 'role{N}'. Schemas / tables / views / sources etc. don't need this — their fully-qualified names already flow through DB.name() which already embeds the seed. Default False so non- Antithesis consumers keep their existing name shapes. Role.__str__ now passes through pg8000.native.identifier() so the quoted-dashed names round-trip correctly; no-op for ASCII names. 2. Database(pool_members: list[ClusterdPoolMember] | None = None) and a new ClusterdPoolMember dataclass (host + storagectl/computectl/ compute/storage ports + workers). When set, the framework provisions unmanaged Cluster replicas with explicit STORAGECTL/STORAGE/ COMPUTECTL/COMPUTE ADDRESSES pointed at the supplied member(s) instead of emitting managed SIZE/REPLICATION FACTOR. The is_pool_backed property on Cluster gates the rendering. 3.
CreateClusterAction / CreateClusterReplicaAction / DropClusterReplicaAction skip pool-backed clusters: there is no in-band allocator for grabbing additional pool members from a worker thread, and replication factor manipulation has no analogue in unmanaged-replica mode. The framework therefore only ever touches the pool members the caller pre-allocated. These three pieces only make sense together: seed-scoping by itself doesn't isolate the clusterd workload; the pool backend by itself collides on global names; skipping the dynamic DDL by itself would just leave clusters un-grown in a managed-cluster topology where that's the workload. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../materialize/parallel_workload/action.py | 23 +- .../materialize/parallel_workload/database.py | 207 ++++++++++++++++-- 2 files changed, 206 insertions(+), 24 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 3318bee4aeaf5..89d9c0801fc76 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -1987,7 +1987,7 @@ def run(self, exe: Executor) -> bool: return False role_id = exe.db.role_id exe.db.role_id += 1 - role = Role(role_id) + role = Role(role_id, name_scope=exe.db.name_scope) role.create(exe) exe.db.roles.append(role) return True @@ -2026,6 +2026,13 @@ def run(self, exe: Executor) -> bool: class CreateClusterAction(Action): def run(self, exe: Executor) -> bool: + # In pool mode the Database's clusters are wired to pre-existing + # clusterd containers from a finite pool the caller passed in. + # Dynamically creating a new cluster would need to claim an unused + # pool member, and we don't have an allocator. Skip — the initial + # clusters set up at construction time are the test surface. 
+ if exe.db.pool_members is not None: + return False with exe.db.lock: if len(exe.db.clusters) >= MAX_CLUSTERS: return False @@ -2037,6 +2044,7 @@ def run(self, exe: Executor) -> bool: size=self.rng.choice(["1", "2"]), replication_factor=self.rng.choice([1, 2]), introspection_interval="1s", + name_scope=exe.db.name_scope, ) cluster.create(exe) exe.db.clusters.append(cluster) @@ -2170,6 +2178,12 @@ def run(self, exe: Executor) -> bool: with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] + # Pool-backed clusters can't grow their replica count — there's + # no pool allocator handing out a fresh ClusterdPoolMember per + # ALTER CLUSTER ADD REPLICA. Skip them. + unmanaged_clusters = [ + c for c in unmanaged_clusters if not c.is_pool_backed + ] if not unmanaged_clusters: return False cluster = self.rng.choice(unmanaged_clusters) @@ -2193,6 +2207,13 @@ def run(self, exe: Executor) -> bool: with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] + # Pool-backed clusters can't shrink either — without an + # allocator to release the pool member back, the in-memory + # model would diverge from materialize's catalog and later + # creates targeting the freed slot would conflict. + unmanaged_clusters = [ + c for c in unmanaged_clusters if not c.is_pool_backed + ] if not unmanaged_clusters: return False cluster = self.rng.choice(unmanaged_clusters) diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index bad0b4081bbde..4eecbdf2ea2f3 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -7,6 +7,7 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. 
+import dataclasses import random import threading import uuid @@ -885,31 +886,81 @@ def __str__(self) -> str: class Role: role_id: int lock: threading.Lock - - def __init__(self, role_id: int): + # Inserted between `role` and `{role_id}` in the generated name. Empty by + # default (giving the historical `role0` shape). When set, gives + # `role-{name_scope}-{role_id}` — used by callers like the Antithesis + # parallel-driver where many concurrent Database instances against one + # materialize would otherwise collide on the same `role0..roleN` names. + name_scope: str + + def __init__(self, role_id: int, name_scope: str = ""): self.role_id = role_id self.lock = threading.Lock() + self.name_scope = name_scope def __str__(self) -> str: + # Format: `role[-{name_scope}-]{role_id}`. The bracketed segment is + # only present when seed-scoping is on, so the historical `role0` + # shape (which non-Antithesis consumers parse) is preserved. + # Scoped names need identifier-quoting because dashes aren't valid + # in an unquoted identifier; unscoped names stay bare to match the + # original SQL the framework emits. + if self.name_scope: + return identifier(f"role-{self.name_scope}-{self.role_id}") return f"role{self.role_id}" def create(self, exe: Executor) -> None: exe.execute(f"CREATE ROLE {self}") +@dataclasses.dataclass(frozen=True) +class ClusterdPoolMember: + """One entry in an external clusterd pool that a `Cluster` can target as + an unmanaged replica. + + Used by callers (Antithesis parallel-driver) that want fault-isolation + per cluster: each pool member is its own container, so Antithesis can + kill/pause/partition exactly one cluster's storage+compute without + taking down the other clusters that share the materialized container's + process orchestrator. + + The default ports match clusterd's defaults; override per environment.
+ """ + + host: str + storagectl_port: int = 2100 + computectl_port: int = 2101 + compute_port: int = 2102 + storage_port: int = 2103 + workers: int = 4 + + class ClusterReplica: replica_id: int size: str cluster: "Cluster" rename: int lock: threading.Lock + # When non-None, the replica is wired to a pre-existing clusterd + # container via unmanaged-cluster syntax (STORAGECTL/COMPUTE ADDRESSES) + # rather than provisioned through the orchestrator. The replica's + # `size` field is ignored in that case; `pool_member.workers` provides + # the WORKERS clause. + pool_member: ClusterdPoolMember | None - def __init__(self, replica_id: int, size: str, cluster: "Cluster"): + def __init__( + self, + replica_id: int, + size: str, + cluster: "Cluster", + pool_member: ClusterdPoolMember | None = None, + ): self.replica_id = replica_id self.size = size self.cluster = cluster self.rename = 0 self.lock = threading.Lock() + self.pool_member = pool_member def name(self) -> str: if self.rename: @@ -935,6 +986,12 @@ class Cluster: introspection_interval: str rename: int lock: threading.Lock + # Inserted between `cluster` and `-{cluster_id}` in the generated name. + # Empty by default (giving the historical `cluster-N` shape). When set, + # gives `cluster-{name_scope}-N` — used by callers like the Antithesis + # parallel-driver, where many concurrent Database instances against one + # materialize would otherwise collide on the same `cluster-N` names. + name_scope: str def __init__( self, @@ -943,29 +1000,84 @@ def __init__( size: str, replication_factor: int, introspection_interval: str, + name_scope: str = "", + pool_members: list[ClusterdPoolMember] | None = None, ): self.cluster_id = cluster_id self.managed = managed self.size = size - self.replicas = [ - ClusterReplica(i, size, self) for i in range(replication_factor) - ] + # When `pool_members` is supplied, the cluster runs in unmanaged mode + # against one pre-existing clusterd container per replica.
We force + # `managed=False` (the unmanaged-cluster syntax is what carries the + # STORAGECTL/COMPUTE ADDRESSES clauses) and ignore `replication_factor` + # in favour of `len(pool_members)`. + if pool_members is not None: + if not pool_members: + raise ValueError( + "pool_members must be non-empty when provided; one member per replica" + ) + self.managed = False + self.replicas = [ + ClusterReplica(i, size, self, pool_member=pool_members[i]) + for i in range(len(pool_members)) + ] + else: + self.replicas = [ + ClusterReplica(i, size, self) for i in range(replication_factor) + ] self.replica_id = len(self.replicas) self.introspection_interval = introspection_interval self.rename = 0 self.lock = threading.Lock() + self.name_scope = name_scope + + @property + def is_pool_backed(self) -> bool: + """True iff every replica is wired to a pre-existing clusterd + container rather than provisioned through the orchestrator. Action + classes that would mutate replica count check this and bail — + we don't dynamically allocate from the pool.""" + return all(r.pool_member is not None for r in self.replicas) def name(self) -> str: + # Format: `cluster[-{name_scope}]-{cluster_id}[-{rename}]`. The + # bracketed `-{name_scope}` segment is only present when seed- + # scoping is on, so the historical `cluster-0` / `cluster-0-1` + # shapes (which non-Antithesis consumers parse) are preserved. + prefix = ( + f"cluster-{self.name_scope}" if self.name_scope else "cluster" + ) if self.rename: - return naughtify(f"cluster-{self.cluster_id}-{self.rename}") - return naughtify(f"cluster-{self.cluster_id}") + return naughtify(f"{prefix}-{self.cluster_id}-{self.rename}") + return naughtify(f"{prefix}-{self.cluster_id}") def __str__(self) -> str: return identifier(self.name()) def create(self, exe: Executor) -> None: query = f"CREATE CLUSTER {self} " - if self.managed: + if self.is_pool_backed: + # Unmanaged cluster pointing at pre-existing clusterd containers. 
+ # Each replica gets the STORAGECTL/STORAGE/COMPUTECTL/COMPUTE + # ADDRESSES of its pool member; WORKERS comes from the pool + # member's config. Requires + # `unsafe_enable_unorchestrated_cluster_replicas = true` on the + # SUT (see test/antithesis/mzcompose.py for the Antithesis case). + replica_specs = [] + for replica in self.replicas: + assert replica.pool_member is not None + m = replica.pool_member + replica_specs.append( + f"{replica} (" + f"STORAGECTL ADDRESSES ['{m.host}:{m.storagectl_port}'], " + f"STORAGE ADDRESSES ['{m.host}:{m.storage_port}'], " + f"COMPUTECTL ADDRESSES ['{m.host}:{m.computectl_port}'], " + f"COMPUTE ADDRESSES ['{m.host}:{m.compute_port}'], " + f"WORKERS {m.workers}" + f")" + ) + query += "REPLICAS(" + ", ".join(replica_specs) + ")" + elif self.managed: query += f"SIZE = '{self.size}', REPLICATION FACTOR = {len(self.replicas)}, INTROSPECTION INTERVAL = '{self.introspection_interval}'" else: query += "REPLICAS(" @@ -1025,12 +1137,35 @@ def __init__( complexity: Complexity, scenario: Scenario, naughty_identifiers: bool, + # When True, top-level objects whose names are not schema-qualified + # (clusters and roles) are scoped by the database seed so concurrent + # Database instances against one materialize don't collide. Off by + # default; opted into by the Antithesis parallel-driver where many + # invocations share the SUT. Tables / schemas / views are already + # qualified by DB.name() which includes the seed, so they don't + # need this. + seed_scoped_names: bool = False, + # When non-None, every cluster the Database creates uses the + # external clusterd-pool backend (unmanaged-with-explicit-addresses) + # rather than the orchestrator. The Database slices this list one + # member per replica across its clusters at construction time. + # See `ClusterdPoolMember` for the shape; sized to fit the + # database's initial cluster + replica plan. 
+ pool_members: list[ClusterdPoolMember] | None = None, ): self.host = host self.ports = ports self.complexity = complexity self.scenario = scenario self.seed = seed + self.seed_scoped_names = seed_scoped_names + self.pool_members = pool_members + # The bare seed (no leading/trailing punctuation) used by Cluster / + # Role / etc. to assemble their scoped names. Empty when seed-scoping + # is off, in which case those classes fall back to their historical + # `cluster-N` / `role0` shapes. See Cluster.name() and Role.__str__() + # for how the seed gets inlaid. + self.name_scope = seed if seed_scoped_names else "" set_naughty_identifiers(naughty_identifiers) self.s3_path = 0 @@ -1064,21 +1199,47 @@ def __init__( ) self.views.append(view) self.view_id = len(self.views) - self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))] - self.role_id = len(self.roles) - # At least one storage cluster required for WebhookSources - self.clusters = [ - Cluster( - i, - managed=rng.choice([True, False]), - size=rng.choice( - ["scale=1,workers=1", "scale=1,workers=4", "scale=2,workers=2"] - ), - replication_factor=1, - introspection_interval="1s", - ) - for i in range(rng.randint(1, MAX_INITIAL_CLUSTERS)) + self.roles = [ + Role(i, name_scope=self.name_scope) + for i in range(rng.randint(0, MAX_INITIAL_ROLES)) ] + self.role_id = len(self.roles) + # At least one storage cluster required for WebhookSources. + # In pool mode, each cluster claims one pool member from a + # deterministic slice; the number of clusters is the slice size, no + # rng.randint. Caller is responsible for sizing `pool_members` to + # the desired cluster count. + if pool_members is not None: + initial_cluster_count = len(pool_members) + self.clusters = [ + Cluster( + i, + # managed/size are ignored when pool-backed but kept as + # placeholder values for any code that reads them + # without consulting `is_pool_backed`. 
+ managed=False, + size=pool_members[i].host, + replication_factor=1, + introspection_interval="1s", + name_scope=self.name_scope, + pool_members=[pool_members[i]], + ) + for i in range(initial_cluster_count) + ] + else: + self.clusters = [ + Cluster( + i, + managed=rng.choice([True, False]), + size=rng.choice( + ["scale=1,workers=1", "scale=1,workers=4", "scale=2,workers=2"] + ), + replication_factor=1, + introspection_interval="1s", + name_scope=self.name_scope, + ) + for i in range(rng.randint(1, MAX_INITIAL_CLUSTERS)) + ] self.cluster_id = len(self.clusters) self.indexes = set() self.webhook_sources = [ From 19db5376106104507a9e881afc9fd342cdcacb96 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 18:27:05 -0400 Subject: [PATCH 43/65] test/antithesis: add configurable clusterd pool for parallel-workload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reserve a pool of pre-existing clusterd containers (`clusterd-pool-{0..N-1}`) so each parallel-workload cluster can land on its own container and Antithesis can fault-inject it in isolation. Without the pool, parallel-workload's clusters all live as child processes of environmentd under the materialized container's process orchestrator, and Antithesis can only kill / pause / partition the container as a unit. Pool size is read from `ANTITHESIS_CLUSTERD_POOL_SIZE` (env), default 8. Each member is identical to clusterd1 / clusterd2: 4 timely workers, no scratch, restart=no. `workflow_default` brings them up before materialized so the controller can reach them when CREATE CLUSTER references their addresses. `Materialized` already has `unsafe_enable_unorchestrated_cluster_replicas` set so CREATE CLUSTER ... STORAGECTL ADDRESSES is accepted. config/docker-compose.yaml regenerated via bin/pyactivate test/antithesis/export-compose.py to match — the YAML is generated, not hand-edited. 
--- test/antithesis/config/docker-compose.yaml | 304 +++++++++++++++++++++ test/antithesis/mzcompose.py | 44 +++ 2 files changed, 348 insertions(+) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 446c9d0a189f6..900b586870e75 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -311,6 +311,310 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + clusterd-pool-0: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-0 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-1: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - 
CLUSTERD_GRPC_HOST=clusterd-pool-1 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-2: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-2 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": 
["clusterd-pool-2:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-3: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-3 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-4: + entrypoint: + - tini + - -- + 
command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-4 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-5: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-5 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 
'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} + clusterd-pool-6: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-6 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: 
${MATERIALIZED_IMAGE} + clusterd-pool-7: + entrypoint: + - tini + - -- + command: + - clusterd + ports: + - 2100 + - 2101 + - 6878 + environment: + - CLUSTERD_GRPC_HOST=clusterd-pool-7 + - CLUSTERD_USE_CTP=true + - MZ_SOFT_ASSERTIONS=1 + - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 + - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 + - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 + - CLUSTERD_SECRETS_READER=local-file + - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets + - LD_PRELOAD=libeatmydata.so + - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 + - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 + - CLUSTERD_PROCESS=0 + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2102"], + "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2103"], + "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": + false, "zero_copy_limit": null}' + volumes: + - mzdata:/mzdata + - mydata:/var/lib/mysql-files + - tmp:/share/tmp + - scratch:/scratch + restart: 'no' + stop_grace_period: 120s + platform: linux/amd64 + image: ${MATERIALIZED_IMAGE} materialized: hostname: materialized depends_on: diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index 5f7da9d8f0e97..d66c63eb3348f 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -20,6 +20,16 @@ Antithesis killing either container exercises the compute/storage-replica recovery and rebalancing paths without taking the cluster offline. + - clusterd-pool-{0..N-1} : a configurable pool of external clusterd + containers that the parallel-workload driver + claims one-per-cluster to give each + parallel-workload cluster its own container. 
+ Without this pool, parallel-workload clusters + would all share materialized's process orchestrator + and Antithesis could only fault the entire + container as a unit. Pool size is controlled by + the `ANTITHESIS_CLUSTERD_POOL_SIZE` env var (read + from the harness; defaults to 8). - materialized : the SUT (environmentd; clusterd is external) - workload : Python test driver wired to the Antithesis SDK @@ -28,6 +38,8 @@ bin/pyactivate test/antithesis/export-compose.py > config/... # dump compose YAML """ +import os + from materialize.mzcompose.composition import Composition from materialize.mzcompose.service import Service, ServiceConfig from materialize.mzcompose.services.clusterd import Clusterd @@ -39,6 +51,15 @@ from materialize.mzcompose.services.schema_registry import SchemaRegistry from materialize.mzcompose.services.zookeeper import Zookeeper +# Number of pool clusterd containers reserved for parallel-workload clusters +# (one container per cluster, giving each its own container-level fault +# domain). Read from the env so CI/local runs can tune it without editing +# this file. Default 8 — enough for ~8 concurrent parallel-driver +# invocations under the v1 "one cluster per invocation, replication +# factor 1" allocation, see test/antithesis/workload/test/ +# parallel_driver_parallel_workload.py. +CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "8")) + class Workload(Service): """Antithesis workload client — Python test driver.""" @@ -146,6 +167,27 @@ def __init__(self) -> None: workers=4, scratch_directory=None, ), + # Pool of identical clusterd containers reserved for the + # parallel-workload driver. Each instance is a possible target for + # one parallel-workload cluster, giving that cluster its own + # container-level fault domain (Antithesis can kill / pause / + # partition / throttle a specific pool member without affecting any + # other cluster). 
Same settings as clusterd1/clusterd2: 4 timely + # workers per process, no scratch (matches production), restart=no + # so Antithesis fault injection isn't fought by docker-compose. + # + # Sizing rationale lives in test/antithesis/workload/test/ + # parallel_driver_parallel_workload.py — the driver maps invocation + # seed → pool slot deterministically and assumes the pool is at + # least as big as the expected concurrent-invocation count. + *[ + Clusterd( + name=f"clusterd-pool-{i}", + workers=4, + scratch_directory=None, + ) + for i in range(CLUSTERD_POOL_SIZE) + ], Materialized( external_blob_store=True, external_metadata_store=True, @@ -166,6 +208,7 @@ def __init__(self) -> None: def workflow_default(c: Composition) -> None: """Bring up the Antithesis test cluster.""" + pool_services = [f"clusterd-pool-{i}" for i in range(CLUSTERD_POOL_SIZE)] c.up( "postgres-metadata", "minio", @@ -174,6 +217,7 @@ def workflow_default(c: Composition) -> None: "schema-registry", "clusterd1", "clusterd2", + *pool_services, "mysql", "mysql-replica", ) From 550f6f65d97f575c057aa2e7422452ef6cbfd2a0 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 18:29:37 -0400 Subject: [PATCH 44/65] test/antithesis: parallel-workload driver runs on per-invocation pool clusterd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the antithesis parallel-workload driver to: * Claim one clusterd-pool-{i} container per invocation via a real allocator (fcntl.flock on /tmp/clusterd-pool-slots/{i}.lock; lock held for the lifetime of the invocation, released on normal return or exception). Slots are tried in randomized order so the slot a driver lands on doesn't correlate with the invocation seed. If every slot is held the driver tags a sometimes() and exits cleanly rather than running unisolated. * Construct ClusterdPoolMember(host='clusterd-pool-{slot}', workers=4) and pass to Database(pool_members=[member], seed_scoped_names=True).
The initial cluster lands on its own clusterd container — Antithesis fault injection targets that container in isolation, which is the point of the whole change. * Scope setup-phase catalog sweeps to objects this invocation owns: 'cluster-{seed}-%' and 'role-{seed}-%'. The previous 'c%' / 'r%' patterns would have torn down every concurrent invocation's still-running state. The shared connections (kafka_conn, csr_conn, aws_conn, minio) live outside any seed-scoped database; we never drop them (CREATE ... IF NOT EXISTS is idempotent and dropping would CASCADE through another invocation's in-flight sources). * Drop seed-scoped clusters / databases / roles in main()'s finally so each invocation leaves the catalog clean and frees its pool-slot's clusterd. The DROP CLUSTER on an unmanaged cluster re-arms the clusterd to accept a fresh controller connection via the same reconcile() path that handles environmentd restarts (storage_state::reconcile drops stale objects, transport::serve cancels the prior connection on the next connect). Pool size is read from CLUSTERD_POOL_SIZE (env), matching the ANTITHESIS_CLUSTERD_POOL_SIZE knob in test/antithesis/mzcompose.py. Default 8. v1 scope (documented for the next round of work): * MAX_INITIAL_CLUSTERS = 1 per invocation, REPLICATION FACTOR = 1. Multi-replica coverage stays in antithesis_cluster. * CreateClusterAction / CreateClusterReplicaAction / DropClusterReplicaAction are skipped in pool mode; no in-band allocator inside the framework yet.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test/parallel_driver_parallel_workload.py | 311 +++++++++++++++--- 1 file changed, 259 insertions(+), 52 deletions(-) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index ec3b91e9b38c6..7d370f2048222 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -38,6 +38,8 @@ from __future__ import annotations +import contextlib +import fcntl import logging import os import random @@ -78,6 +80,7 @@ MAX_TABLES, MAX_VIEWS, MAX_WEBHOOK_SOURCES, + ClusterdPoolMember, Database, ) from materialize.parallel_workload.executor import Executor @@ -105,6 +108,30 @@ RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20")) NUM_THREADS = int(os.environ.get("PW_THREADS", "4")) +# Number of clusterd-pool-{i} containers reserved for the parallel-workload +# driver. Must match the pool actually deployed in +# test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there → +# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims one +# slot via `fcntl.flock` (see `_claim_pool_slot`); the lock is held for +# the lifetime of the invocation so concurrent driver processes inside +# the workload container can't pick the same clusterd. +CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8")) + +# Workers configured per clusterd-pool-{i} process. Must match the +# `Clusterd(..., workers=...)` argument in test/antithesis/mzcompose.py +# or the unmanaged CREATE CLUSTER REPLICA's `WORKERS` count will diverge +# from what clusterd actually runs. +CLUSTERD_POOL_WORKERS = 4 + +# Filesystem locks let concurrent parallel-workload invocations claim +# distinct clusterd-pool members without coordinating through the SUT. 
+# All invocations exec inside the single `workload` container so a +# regular flock on a tmpfs path is sufficient (no cross-container +# coordination required). +POOL_SLOT_LOCK_DIR = os.environ.get( + "CLUSTERD_POOL_SLOT_LOCK_DIR", "/tmp/clusterd-pool-slots" +) + def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None: try: @@ -223,37 +250,152 @@ def _tolerate_setup_race(fn, *args, **kwargs): raise +@contextlib.contextmanager +def _claim_pool_slot(rng: random.Random): + """Hold an exclusive `fcntl.flock` on a pool-slot lockfile for the + duration of the `with` block. Yields the slot index, or `None` if every + slot is busy. + + Slots are tried in a randomized order so the slot a driver lands on + doesn't correlate with deterministic state (test composer seed, wall + clock). The lock is released when the context exits — either normally + or via exception — so a crashing driver doesn't strand the slot. + + All parallel-workload driver invocations share the workload container's + filesystem, so a plain flock on a tmpfs path under `POOL_SLOT_LOCK_DIR` + is sufficient to serialize claims. If the path can't be created we fall + back to yielding `None` (caller must handle: the existing setup-tolerance + path can absorb a slot collision, it just costs us pool isolation for + that one invocation). + """ + try: + os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True) + except OSError as exc: + LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc) + yield None + return + + slots = list(range(CLUSTERD_POOL_SIZE)) + rng.shuffle(slots) + for slot in slots: + path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock") + fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600) + try: + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: + # Another invocation owns this slot; try the next one. 
+ os.close(fd) + continue + try: + yield slot + finally: + try: + fcntl.flock(fd, fcntl.LOCK_UN) + finally: + os.close(fd) + return + LOG.warning("all %d pool slots are claimed; running without isolation", CLUSTERD_POOL_SIZE) + yield None + + +def _drop_seed_scoped_objects(seed: str) -> None: + """Drop everything this invocation's seed owns: its clusters, roles, and + databases. Called from `main()`'s finally so each invocation leaves the + catalog clean and frees its pool-slot's clusterd to be claimed by the + next driver run (DROP CLUSTER tears down the unmanaged replica → the + clusterd's existing controller connection ends → the next CREATE + CLUSTER pointed at the same address claims it via fresh reconcile). + + Errors here are logged and swallowed: leftover objects only cost a bit + of catalog footprint until the next invocation's setup sweep picks them + up. Don't let a cleanup failure turn into an assertion failure. + """ + from pg8000.native import identifier + + try: + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + autocommit=True, + connect_timeout=15, + ) as conn, + conn.cursor() as cur, + ): + # `seed` is u64-derived; safe to splice. We can't use psycopg's + # parameter binding for `LIKE` patterns here without forcing the + # caller to think about driver-specific placeholder syntax — + # inline f-strings match the rest of this module. 
+ def _drop(sql: str) -> None: + try: + cur.execute(sql.encode()) + except Exception as exc: # noqa: BLE001 + LOG.debug("cleanup tolerated: %s — %s", sql, exc) + + cur.execute( + f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'".encode() + ) + for row in cur.fetchall(): + _drop(f"DROP CLUSTER {identifier(row[0])} CASCADE") + + cur.execute( + f"SELECT name FROM mz_databases WHERE name LIKE 'db-pw-{seed}-%'".encode() + ) + for row in cur.fetchall(): + _drop(f"DROP DATABASE {identifier(row[0])} CASCADE") + + cur.execute( + f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'".encode() + ) + for row in cur.fetchall(): + _drop(f"DROP ROLE {identifier(row[0])}") + except Exception as exc: # noqa: BLE001 + LOG.warning("cleanup connection failed: %s", exc) + + def _create_database_for_antithesis(database: Database, exe: Executor) -> None: """Stand-in for `Database.create` that only sets up connections matching the Antithesis topology. Upstream's `create()` also wires polaris, sql-server, and an external postgres source — none of those are running in this compose. - Every statement is wrapped with `_tolerate_setup_race` because parallel - invocations of this driver race the same deterministic object names - (`role0..roleN`, `cluster-0..cluster-N`). Whoever loses the race for a - given object sees a known race message — already-exists, unknown-role, - unknown-cluster, or a transient DEPENDS-ON cleanup mismatch — and the - other invocation's outcome is fine for our purposes. + Catalog sweeps are scoped to objects this invocation owns: clusters + matching `cluster-{seed}-%` and roles matching `role-{seed}-%`. The + seed-scoped names are produced by `Database(seed_scoped_names=True)`; + cleaning anything broader would delete state belonging to other + concurrent invocations sharing the same SUT. 
+ + The shared connections / secret (`kafka_conn`, `csr_conn`, `aws_conn`, + `minio`) live outside any seed-scoped database and are required by every + invocation. We never drop them — `CREATE ... IF NOT EXISTS` is + idempotent and dropping would CASCADE through another invocation's + in-flight sources. + + Setup-phase statements are wrapped with `_tolerate_setup_race` so a + losing race against another invocation creating the same shared object + (or against our own scoped leftovers being already absent) doesn't kill + the driver. """ from pg8000.native import identifier + seed = database.seed + for db in database.dbs: _tolerate_setup_race(db.drop, exe) _tolerate_setup_race(db.create, exe) - exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + # `seed` is the random_u64 the driver minted at the top of main(), so + # it's already safe to splice into SQL literally. `Executor.execute` + # takes a query string and doesn't support parameter binding. + exe.execute(f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'") for row in exe.cur.fetchall(): _tolerate_setup_race( exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE" ) - _tolerate_setup_race(exe.execute, "DROP SECRET IF EXISTS minio CASCADE") - _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS aws_conn CASCADE") - _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS kafka_conn CASCADE") - _tolerate_setup_race(exe.execute, "DROP CONNECTION IF EXISTS csr_conn CASCADE") - - exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + exe.execute(f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'") for row in exe.cur.fetchall(): _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}") @@ -340,9 +482,65 @@ def main() -> int: _prepare_system(NUM_THREADS) + # Claim one clusterd-pool-{i} container for this invocation. The flock + # is held until main() returns; concurrent invocations inside the + # workload container can't pick the same slot. 
If every slot is busy + # the context manager yields `None` — we tag that with a sometimes() + # for visibility and exit cleanly (the property surface for this + # invocation just doesn't get exercised). + # + # Each parallel-workload cluster lands on its own clusterd-pool-{slot} + # container, giving Antithesis per-cluster fault isolation. Without + # this, every parallel-workload cluster would be a child process of + # environmentd under the materialized container's process orchestrator, + # and the only container-level fault would be "the whole world". + with _claim_pool_slot(rng) as pool_slot: + sometimes( + pool_slot is not None, + "parallel workload: clusterd pool slot claimed", + {"pool_size": CLUSTERD_POOL_SIZE}, + ) + if pool_slot is None: + LOG.info( + "parallel-workload exiting cleanly: no pool slot available " + "(pool_size=%d)", + CLUSTERD_POOL_SIZE, + ) + return 0 + pool_member = ClusterdPoolMember( + host=f"clusterd-pool-{pool_slot}", + workers=CLUSTERD_POOL_WORKERS, + ) + LOG.info( + "parallel-workload claimed pool slot %d (%s)", + pool_slot, + pool_member.host, + ) + return _run_invocation(seed, rng, pool_member) + + +def _run_invocation( + seed: str, + rng: random.Random, + pool_member: ClusterdPoolMember, +) -> int: + """The bulk of `main()` once a pool slot has been claimed. Split out so + the slot lock stays held across this whole call: the lock is released + when the enclosing `with` block in `main()` exits. + """ + # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection # drops, which mirrors what Antithesis container-pauses look like at the # client. We never instantiate `KillAction` itself. + # + # `seed_scoped_names=True` keeps cluster/role names from colliding when + # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for + # the fallback when they collide anyway. 
+ # + # `pool_members=[pool_member]` puts this invocation's single cluster + # on the pool member above; the framework forces managed=False and + # emits unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE + # ADDRESSES. database = Database( rng=rng, seed=seed, @@ -357,54 +555,63 @@ def main() -> int: complexity=Complexity.DDL, scenario=Scenario.Kill, naughty_identifiers=False, + seed_scoped_names=True, + pool_members=[pool_member], ) end_time = time.time() + RUNTIME_S setup_failure: Exception | None = None - try: - with ( - psycopg.connect( - host=PGHOST, - port=PGPORT, - user=PGUSER, - dbname=PGDATABASE, - autocommit=True, - connect_timeout=15, - ) as setup_conn, - setup_conn.cursor() as setup_cur, - ): - setup_exe = Executor(rng, setup_cur, None, database) - _create_database_for_antithesis(database, setup_exe) - except Exception as exc: # noqa: BLE001 - setup_failure = exc - LOG.exception("parallel-workload setup failed") - workers: list[Worker] = [] threads: list[threading.Thread] = [] worker_failed: WorkerFailedException | None = None - if setup_failure is None: - workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS) + try: try: - while time.time() < end_time: - dead = [t for t in threads if not t.is_alive()] - if dead: - occurred = next( - (w.occurred_exception for w in workers if w.occurred_exception), - None, - ) - worker_failed = WorkerFailedException( - f"thread {dead[0].name} exited early", occurred - ) - for worker in workers: - worker.end_time = time.time() - break - time.sleep(0.5) - finally: - for worker in workers: - worker.end_time = time.time() - for thread in threads: - thread.join(timeout=30) + with ( + psycopg.connect( + host=PGHOST, + port=PGPORT, + user=PGUSER, + dbname=PGDATABASE, + autocommit=True, + connect_timeout=15, + ) as setup_conn, + setup_conn.cursor() as setup_cur, + ): + setup_exe = Executor(rng, setup_cur, None, database) + _create_database_for_antithesis(database, setup_exe) + except Exception as exc: # 
noqa: BLE001 + setup_failure = exc + LOG.exception("parallel-workload setup failed") + + if setup_failure is None: + workers, threads = _spawn_workers(rng, database, end_time, NUM_THREADS) + try: + while time.time() < end_time: + dead = [t for t in threads if not t.is_alive()] + if dead: + occurred = next( + (w.occurred_exception for w in workers if w.occurred_exception), + None, + ) + worker_failed = WorkerFailedException( + f"thread {dead[0].name} exited early", occurred + ) + for worker in workers: + worker.end_time = time.time() + break + time.sleep(0.5) + finally: + for worker in workers: + worker.end_time = time.time() + for thread in threads: + thread.join(timeout=30) + finally: + # Always free this invocation's seed-scoped state, including its + # pool-slot cluster, so the next driver invocation can claim the + # slot cleanly. Wrapped in try/except inside the helper; any + # cleanup failure is logged but never escapes. + _drop_seed_scoped_objects(seed) total_queries = sum(w.num_queries.total() for w in workers) total_ignored = sum( From 008830b0b5faae7c9b6fe0eb00bd6f39f3c2a01b Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 19:00:40 -0400 Subject: [PATCH 45/65] test/antithesis/scratchbook: per-cluster fault isolation for parallel-workload Documents the pool-backed parallel-workload topology: pool of clusterd containers, file-lock slot allocator, seed-scoped naming, drop-on-exit, and the reconcile-path correctness argument for clusterd reuse across DROP/CREATE CLUSTER cycles. Lists current failure modes (all-slots-held, crash-before-cleanup, sizing) and v1 limitations the next round of work will close. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../parallel-workload-fault-isolation.md | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 test/antithesis/scratchbook/parallel-workload-fault-isolation.md diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md new file mode 100644 index 0000000000000..8cbe235b8183b --- /dev/null +++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md @@ -0,0 +1,163 @@ +# Per-cluster fault isolation for parallel-workload under Antithesis + +## Problem + +Antithesis fault injection operates at the docker-compose container +boundary: it kills, pauses, partitions, throttles individual containers. +For per-cluster fault coverage to be observable, each cluster the SUT +allocates needs to live in its own container — otherwise "fault one +cluster" reduces to "fault every cluster sharing this container". + +The Antithesis compose has one `materialized` container running +environmentd. By default, every cluster a workload provisions becomes a +clusterd child process under that environmentd's process orchestrator. +Antithesis cannot fault a single child process; the smallest fault unit +is the whole `materialized` container, which is "the entire SUT". + +The `antithesis_cluster` (the always-on user cluster the long-running +workloads target) is already an unmanaged cluster pointed at two external +clusterd containers (`clusterd1`, `clusterd2`), one per replica. That +gives us per-replica fault coverage for that cluster. + +The gap is `parallel-workload` clusters. The randomized stress driver +creates new clusters as part of its action surface. Without external +clusterds, every parallel-workload cluster collapses back onto +environmentd's process orchestrator and the fault domain disappears. 
+ +## Solution + +A pool of identical pre-deployed clusterd containers +(`clusterd-pool-{0..N-1}`), one container per parallel-workload +invocation. Each invocation claims one slot via filesystem locking, +provisions its sole cluster as an unmanaged replica pointed at that +slot's clusterd, and releases the slot on exit. + +Components, bottom-up: + + - **`Clusterd(name="clusterd-pool-{i}", workers=4, scratch_directory=None)`** + in `test/antithesis/mzcompose.py`. Same configuration as + `clusterd1`/`clusterd2`: four timely workers per process (so + Antithesis thread-pause faults have something distinct to pause), + mem_env RocksDB (matches production, no scratch volume to fight over). + Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8). + + - **`parallel_workload.Database(pool_members=..., + seed_scoped_names=True)`**. Opt-in framework mode: when + `pool_members` is set, the framework provisions unmanaged clusters + with explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES + instead of managed SIZE/REPLICATION FACTOR; the CreateCluster / + CreateReplica / DropReplica actions skip pool-backed clusters + because there is no in-band allocator. `seed_scoped_names=True` + renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` / + `role-{seed}-{N}` so concurrent invocations don't collide on + global names. + + - **`_claim_pool_slot()`** in + `test/antithesis/workload/test/parallel_driver_parallel_workload.py`. + Contextmanager that holds `fcntl.flock(LOCK_EX | LOCK_NB)` on + `/tmp/clusterd-pool-slots/{i}.lock` for the lifetime of the + invocation. Slots tried in randomized order so allocation is + decorrelated from invocation seed. The lock is released on context + exit (normal or exception), so a crashing driver doesn't strand the + slot. + + - **`_drop_seed_scoped_objects()`** in the same driver, called in + `main()`'s `finally`. Drops every cluster / database / role whose + name starts with `cluster-{seed}-` / `db-pw-{seed}-` / + `role-{seed}-`. 
The DROP CLUSTER re-arms the clusterd to be + claimed by the next invocation through the reconcile path + (see below). + +## Clusterd reuse correctness + +The pool design assumes a DROP CLUSTER followed by a CREATE CLUSTER +pointed at the same clusterd is a supported transition. It is — this is +the same reconciliation path that handles environmentd restart. The +three pieces: + + 1. **Transport cancels the prior connection on every new connect.** + `src/service/src/transport.rs::serve` drops the old + connection-task token and awaits the task before installing a + fresh handler from `handler_fn()`. The new `ClusterClient` is a + blank-slate wrapper around the same `Arc>`. + + 2. **The worker `run` loop survives client disconnects.** + `src/storage/src/storage_state.rs::Worker::run` is + `while let Some((nonce, rx, tx)) = client_rx.blocking_recv() { + run_client(rx, tx); }`. When the old `cmd_tx` is dropped (because + the cancel above tore down the prior client), `run_client` returns + and the outer loop awaits the next `(nonce, rx, tx)` — the new + controller's connection. Worker in-memory state stays resident + between connections. + + 3. **`reconcile()` drops stale state.** The new controller's first + batch of commands ending in `InitializationComplete` is processed + by `storage_state::reconcile`: it computes `expected_objects` from + the new commands, identifies `stale_objects` as anything the + worker knows about that the new controller did not ask for, and + `drop_collection`s each one — releasing source tokens (which tears + down Kafka consumers, persist write handles, upsert RocksDB state), + dropping dataflows, clearing reported frontiers. + +Collection IDs do not collide across cluster lifetimes because +Materialize allocates them globally (`u`, `t`), not per cluster. + +The one piece intentionally shared across reconnects is the +`Arc`. 
It is keyed by URL+credentials, not by +cluster identity, and reusing it is the standard production behavior +(avoids reauthenticating to S3 / postgres-metadata on every reconnect). + +The same analysis holds for the compute side (`src/compute/src/server.rs` +uses the same `ClusterSpec` pattern). + +## Failure modes + + - **All pool slots held.** Driver tags `sometimes(...)` for + visibility and exits cleanly. With the default pool size (8) and + the test composer's normal concurrency this is not expected to + fire, but if it does we'll see it in the run report. + + - **Crash before drop-on-exit runs.** The flock is released + automatically when the process dies (kernel-level lock release). + The clusterd is left holding stale state until the next claimant + reconciles. Catalog leftovers (`cluster-{seed}-*`, + `role-{seed}-*`, `db-pw-{seed}-*`) accumulate until the next + invocation with the same seed runs its setup sweep — extremely + unlikely since seeds are u64-random. The setup sweep is scoped + to the current seed only, so it does not clean cross-invocation + leftovers. A periodic external cleanup or a startup-time scan + against `mz_clusters` / `mz_roles` / `mz_databases` would be + needed to close this loop properly. For now the catalog growth + is bounded by run length and not currently a problem. + + - **Pool sizing wrong vs concurrency.** If concurrency exceeds pool + size, the late arrivals get "no slot" and exit. We do not currently + auto-tune; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if telemetry shows + the "no slot available" signal firing. + +## v1 limitations (future work) + + - **REPLICATION FACTOR 1, no multi-replica parallel-workload coverage.** + The pool gives each invocation one container; multi-replica + coverage for compute/storage paths remains in `antithesis_cluster`. 
+ + - **No in-band allocator inside the framework.** Worker threads + cannot grab additional pool members mid-run, so + `CreateClusterAction` / `CreateClusterReplicaAction` / + `DropClusterReplicaAction` are skipped when pool-backed. The + framework only ever touches the pre-allocated pool members. + + - **No global GC of cross-invocation catalog leftovers.** See + failure modes above. A first-invocation sweep against + `mz_clusters WHERE name LIKE 'cluster-%-%'` minus the current + seed would close this; deferred until it becomes a problem. + +## Tunables + +| Variable | Default | Effect | +|---|---|---| +| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. | +| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. | +| `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. | +| `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. | +| `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. | From 84bdebe000191e38e953108067bde19d5f997216 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 19:04:17 -0400 Subject: [PATCH 46/65] parallel_workload: pool mode provisions one cluster with N replicas, not N single-replica clusters The driver now claims best-effort up to PW_DESIRED_REPLICAS (default 2) clusterd-pool slots per invocation; the framework consumes the whole pool_members list into a single unmanaged cluster with one replica per member instead of one cluster per member. Gives multi-replica fault coverage to parallel-workload (previously only antithesis_cluster ran multi-replica) when pool capacity allows, and degrades gracefully to a single-replica cluster under contention. 
The driver helper renamed from _claim_pool_slot (yields one slot or None) to _claim_pool_slots (yields list of 0..desired slots) so the contention fallback is just a shorter list rather than a special case. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../materialize/parallel_workload/database.py | 19 ++- .../parallel-workload-fault-isolation.md | 36 ++--- .../test/parallel_driver_parallel_workload.py | 151 ++++++++++-------- 3 files changed, 109 insertions(+), 97 deletions(-) diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 4eecbdf2ea2f3..be639f9145f82 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -1205,26 +1205,25 @@ def __init__( ] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources. - # In pool mode, each cluster claims one pool member from a - # deterministic slice; the number of clusters is the slice size, no - # rng.randint. Caller is responsible for sizing `pool_members` to - # the desired cluster count. + # In pool mode, the entire `pool_members` list is consumed by a + # single unmanaged cluster — one replica per member — so the + # caller controls both replica count and pool-member identity. + # This is the only initial cluster; CreateClusterAction is + # disabled in pool mode (no in-band allocator). if pool_members is not None: - initial_cluster_count = len(pool_members) self.clusters = [ Cluster( - i, + 0, # managed/size are ignored when pool-backed but kept as # placeholder values for any code that reads them # without consulting `is_pool_backed`. 
managed=False, - size=pool_members[i].host, - replication_factor=1, + size=pool_members[0].host, + replication_factor=len(pool_members), introspection_interval="1s", name_scope=self.name_scope, - pool_members=[pool_members[i]], + pool_members=pool_members, ) - for i in range(initial_cluster_count) ] else: self.clusters = [ diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md index 8cbe235b8183b..f9e182c043ab5 100644 --- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md +++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md @@ -27,10 +27,12 @@ environmentd's process orchestrator and the fault domain disappears. ## Solution A pool of identical pre-deployed clusterd containers -(`clusterd-pool-{0..N-1}`), one container per parallel-workload -invocation. Each invocation claims one slot via filesystem locking, -provisions its sole cluster as an unmanaged replica pointed at that -slot's clusterd, and releases the slot on exit. +(`clusterd-pool-{0..N-1}`). Each invocation claims up to +`PW_DESIRED_REPLICAS` (default 2) slots via filesystem locking and +provisions a single unmanaged cluster with one replica per claimed +slot, then releases the locks on exit. Best-effort: with N slots +claimed the cluster runs as an N-replica cluster (1 ≤ N ≤ desired); +no slots → exit cleanly. Components, bottom-up: @@ -43,23 +45,24 @@ Components, bottom-up: - **`parallel_workload.Database(pool_members=..., seed_scoped_names=True)`**. 
Opt-in framework mode: when - `pool_members` is set, the framework provisions unmanaged clusters - with explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES - instead of managed SIZE/REPLICATION FACTOR; the CreateCluster / + `pool_members` is set, the framework provisions one unmanaged + cluster with `len(pool_members)` replicas, each pointed at a pool + member via explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES + (in place of managed SIZE/REPLICATION FACTOR); the CreateCluster / CreateReplica / DropReplica actions skip pool-backed clusters because there is no in-band allocator. `seed_scoped_names=True` renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` / `role-{seed}-{N}` so concurrent invocations don't collide on global names. - - **`_claim_pool_slot()`** in + - **`_claim_pool_slots()`** in `test/antithesis/workload/test/parallel_driver_parallel_workload.py`. - Contextmanager that holds `fcntl.flock(LOCK_EX | LOCK_NB)` on - `/tmp/clusterd-pool-slots/{i}.lock` for the lifetime of the - invocation. Slots tried in randomized order so allocation is - decorrelated from invocation seed. The lock is released on context - exit (normal or exception), so a crashing driver doesn't strand the - slot. + Contextmanager that holds up to `PW_DESIRED_REPLICAS` exclusive + `fcntl.flock`s on `/tmp/clusterd-pool-slots/{i}.lock` for the + lifetime of the invocation. Slots are tried in randomized order so + allocation is decorrelated from invocation seed. Every claimed lock + is released on context exit (normal or exception), so a crashing + driver doesn't strand any slot. - **`_drop_seed_scoped_objects()`** in the same driver, called in `main()`'s `finally`. Drops every cluster / database / role whose @@ -137,10 +140,6 @@ uses the same `ClusterSpec` pattern). 
## v1 limitations (future work) - - **REPLICATION FACTOR 1, no multi-replica parallel-workload coverage.** - The pool gives each invocation one container; multi-replica - coverage for compute/storage paths remains in `antithesis_cluster`. - - **No in-band allocator inside the framework.** Worker threads cannot grab additional pool members mid-run, so `CreateClusterAction` / `CreateClusterReplicaAction` / @@ -159,5 +158,6 @@ uses the same `ClusterSpec` pattern). | `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. | | `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. | | `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. | +| `PW_DESIRED_REPLICAS` (driver) | 2 | Replicas to ask for per invocation's cluster. Best-effort: driver claims up to this many slots and runs with whatever it gets (≥1). | | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. | | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. | diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 7d370f2048222..5929f88327590 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -111,10 +111,10 @@ # Number of clusterd-pool-{i} containers reserved for the parallel-workload # driver. Must match the pool actually deployed in # test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there → -# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims one -# slot via `fcntl.flock` (see `_claim_pool_slot`); the lock is held for -# the lifetime of the invocation so concurrent driver processes inside -# the workload container can't pick the same clusterd. 
+# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims +# slots via `fcntl.flock` (see `_claim_pool_slots`); the locks are held +# for the lifetime of the invocation so concurrent driver processes +# inside the workload container can't pick the same clusterd. CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8")) # Workers configured per clusterd-pool-{i} process. Must match the @@ -123,6 +123,14 @@ # from what clusterd actually runs. CLUSTERD_POOL_WORKERS = 4 +# Replicas to ask for per invocation's cluster. Best-effort: the driver +# claims up to this many pool slots and runs whatever it gets (≥1). With +# DESIRED_REPLICAS=2 and POOL_SIZE=8 we get multi-replica coverage for +# the parallel-workload cluster (currently only `antithesis_cluster` is +# multi-replica) when capacity allows, while degrading gracefully to a +# single-replica cluster under contention. +DESIRED_REPLICAS = int(os.environ.get("PW_DESIRED_REPLICAS", "2")) + # Filesystem locks let concurrent parallel-workload invocations claim # distinct clusterd-pool members without coordinating through the SUT. # All invocations exec inside the single `workload` container so a @@ -251,51 +259,52 @@ def _tolerate_setup_race(fn, *args, **kwargs): @contextlib.contextmanager -def _claim_pool_slot(rng: random.Random): - """Hold an exclusive `fcntl.flock` on a pool-slot lockfile for the - duration of the `with` block. Yields the slot index, or `None` if every - slot is busy. - - Slots are tried in a randomized order so the slot a driver lands on - doesn't correlate with deterministic state (test composer seed, wall - clock). The lock is released when the context exits — either normally - or via exception — so a crashing driver doesn't strand the slot. - - All parallel-workload driver invocations share the workload container's - filesystem, so a plain flock on a tmpfs path under `POOL_SLOT_LOCK_DIR` - is sufficient to serialize claims. 
If the path can't be created we fall - back to yielding `None` (caller must handle: the existing setup-tolerance - path can absorb a slot collision, it just costs us pool isolation for - that one invocation). +def _claim_pool_slots(rng: random.Random, desired: int): + """Hold exclusive `fcntl.flock`s on up to `desired` pool-slot lockfiles + for the duration of the `with` block. Yields the list of claimed slot + indices (length 0–`desired`); the caller decides what to do with each + population (1 = single-replica fallback, ≥2 = multi-replica cluster, + 0 = no slots available, exit cleanly). + + Slots are tried in randomized order so allocation is decorrelated + from invocation seed / wall clock. Every claimed flock is released + when the context exits — normally or via exception — so a crashing + driver doesn't strand any slot. + + All parallel-workload driver invocations share the workload + container's filesystem, so plain flock on a tmpfs path under + `POOL_SLOT_LOCK_DIR` is sufficient serialization (no cross-container + coordination required). """ try: os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True) except OSError as exc: LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc) - yield None + yield [] return slots = list(range(CLUSTERD_POOL_SIZE)) rng.shuffle(slots) - for slot in slots: - path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock") - fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600) - try: - fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) - except OSError: - # Another invocation owns this slot; try the next one. 
- os.close(fd) - continue - try: - yield slot - finally: + held: list[tuple[int, int]] = [] # (slot, fd) + try: + for slot in slots: + if len(held) >= desired: + break + path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock") + fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600) + try: + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: + os.close(fd) + continue + held.append((slot, fd)) + yield [slot for slot, _ in held] + finally: + for _, fd in held: try: fcntl.flock(fd, fcntl.LOCK_UN) finally: os.close(fd) - return - LOG.warning("all %d pool slots are claimed; running without isolation", CLUSTERD_POOL_SIZE) - yield None def _drop_seed_scoped_objects(seed: str) -> None: @@ -482,50 +491,54 @@ def main() -> int: _prepare_system(NUM_THREADS) - # Claim one clusterd-pool-{i} container for this invocation. The flock - # is held until main() returns; concurrent invocations inside the - # workload container can't pick the same slot. If every slot is busy - # the context manager yields `None` — we tag that with a sometimes() - # for visibility and exit cleanly (the property surface for this - # invocation just doesn't get exercised). + # Claim up to DESIRED_REPLICAS pool slots; the cluster runs with as + # many replicas as we got (≥1). Locks are held until main() returns; + # if no slot is free we tag a sometimes() and exit cleanly. # - # Each parallel-workload cluster lands on its own clusterd-pool-{slot} - # container, giving Antithesis per-cluster fault isolation. Without - # this, every parallel-workload cluster would be a child process of - # environmentd under the materialized container's process orchestrator, - # and the only container-level fault would be "the whole world". 
- with _claim_pool_slot(rng) as pool_slot: + # Each replica lands on its own clusterd-pool-{slot} container, so + # Antithesis can fault one replica's container without taking the + # cluster offline — exercises the same multi-replica recovery paths + # `antithesis_cluster` covers, but on the workload-driven cluster. + with _claim_pool_slots(rng, DESIRED_REPLICAS) as pool_slots: + sometimes( + len(pool_slots) > 0, + "parallel workload: clusterd pool slots claimed", + {"pool_size": CLUSTERD_POOL_SIZE, "claimed": len(pool_slots)}, + ) sometimes( - pool_slot is not None, - "parallel workload: clusterd pool slot claimed", - {"pool_size": CLUSTERD_POOL_SIZE}, + len(pool_slots) >= DESIRED_REPLICAS, + "parallel workload: full multi-replica pool claim", + {"pool_size": CLUSTERD_POOL_SIZE, "desired": DESIRED_REPLICAS}, ) - if pool_slot is None: + if not pool_slots: LOG.info( - "parallel-workload exiting cleanly: no pool slot available " + "parallel-workload exiting cleanly: no pool slots available " "(pool_size=%d)", CLUSTERD_POOL_SIZE, ) return 0 - pool_member = ClusterdPoolMember( - host=f"clusterd-pool-{pool_slot}", - workers=CLUSTERD_POOL_WORKERS, - ) + pool_members = [ + ClusterdPoolMember( + host=f"clusterd-pool-{slot}", + workers=CLUSTERD_POOL_WORKERS, + ) + for slot in pool_slots + ] LOG.info( - "parallel-workload claimed pool slot %d (%s)", - pool_slot, - pool_member.host, + "parallel-workload claimed %d pool slot(s): %s", + len(pool_slots), + ", ".join(m.host for m in pool_members), ) - return _run_invocation(seed, rng, pool_member) + return _run_invocation(seed, rng, pool_members) def _run_invocation( seed: str, rng: random.Random, - pool_member: ClusterdPoolMember, + pool_members: list[ClusterdPoolMember], ) -> int: - """The bulk of `main()` once a pool slot has been claimed. Split out so - the slot lock stays held across this whole call: the lock is released + """The bulk of `main()` once pool slot(s) have been claimed. 
Split out + so the slot locks stay held across this whole call: they are released when the enclosing `with` block in `main()` exits. """ @@ -537,10 +550,10 @@ def _run_invocation( # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for # the fallback when they collide anyway. # - # `pool_members=[pool_member]` puts this invocation's single cluster - # on the pool member above; the framework forces managed=False and - # emits unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE - # ADDRESSES. + # `pool_members=pool_members` makes a single unmanaged cluster with one + # replica per member; the framework forces managed=False and emits + # unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE ADDRESSES + # for each replica. database = Database( rng=rng, seed=seed, @@ -556,7 +569,7 @@ def _run_invocation( scenario=Scenario.Kill, naughty_identifiers=False, seed_scoped_names=True, - pool_members=[pool_member], + pool_members=pool_members, ) end_time = time.time() + RUNTIME_S From bb766ee19283c8cf56ff529a22cafbeca7434985 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 13 May 2026 19:42:23 -0400 Subject: [PATCH 47/65] test/antithesis: tolerate Antithesis fault-injection errors in parallel-workload setup and worker phases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fault-injection shapes were escaping the parallel-workload driver as 'unexpected' SQL errors and tripping the always() assertion: 1. SETUP: "Failed to resolve hostname" during CREATE CONNECTION FOR KAFKA — materialized's broker-validation can't reach `kafka` because Antithesis paused the kafka container or partitioned DNS. 2. SETUP: "connection timeout expired" / "server closed the connection unexpectedly" from the driver's psycopg.connect to `materialized:6875` — Antithesis paused the materialized container during setup. 3. 
WORKER: "thread pw-worker-N exited early" with no captured cause — `Worker.run`'s initial psycopg.connect / websocket / SET statements run outside any try/except, so a fault that lands during worker startup kills the thread without populating `occurred_exception`. The driver's worker_failed payload then reports just the thread-exit-early string with no underlying cause. None of these are SUT correctness issues — they're the expected cost of fault injection landing at inconvenient moments. The fix adds a second tolerance category, _SETUP_FAULT_PATTERNS, alongside the existing _SETUP_RACE_PATTERNS, and: * Inside _tolerate_setup_race: swallow per-statement exceptions whose message matches either pattern set. * Around the whole setup phase: if setup_failure matches, demote out of the always() assertion and into a sometimes() for visibility. * Around worker thread death: if occurred_exception is None (fault-killed startup) or its message matches a fault pattern, demote out of the always() assertion and into a sometimes(). A worker that captures a non-fault exception still fails the run as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test/parallel_driver_parallel_workload.py | 120 ++++++++++++++++-- 1 file changed, 111 insertions(+), 9 deletions(-) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 5929f88327590..edbce1b622636 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -233,26 +233,96 @@ def _prepare_system(num_threads: int) -> None: "object state changed while transaction was in progress", ) +# Substring matches for setup-phase errors caused by Antithesis fault +# injection rather than workload misuse. 
Antithesis can pause any container +# (materialized, kafka, postgres-metadata, minio) at any time; if a pause +# lands during the driver's setup phase we see one of these shapes. The +# distinction matters because the always() assertion at the bottom of main() +# treats setup failures as unexpected by default — a connection timeout to +# materialized or a hostname-resolve failure for kafka is fault timing, not +# a correctness bug to fail the run on. +# +# "connection timeout" — driver's psycopg.connect to materialized +# timed out (materialized container paused). +# "Multiple connection attempts failed" +# — same shape, retry-exhaustion wording. +# "Failed to resolve hostname"— materialized's CREATE CONNECTION +# validation can't resolve `kafka` (kafka +# container paused or DNS path partitioned). +# "connection refused" — target is up but socket closed; transient. +# "connection reset" — TCP reset during a fault. +# "broken pipe" — write to a socket the peer closed. +# "EOF detected" — psycopg's wording for peer-closed during +# query. +# "server closed the connection unexpectedly" +# — common psycopg flavour. +_SETUP_FAULT_PATTERNS = ( + "connection timeout", + "Multiple connection attempts failed", + "Failed to resolve hostname", + "connection refused", + "connection reset", + "broken pipe", + "EOF detected", + "server closed the connection unexpectedly", +) + + +def _matches_setup_tolerance(exc: BaseException) -> bool: + """True if `exc` is a setup-phase error we expect to see under either + concurrent-driver races or Antithesis fault injection. Used both inside + `_tolerate_setup_race` (to swallow per-statement) and around the whole + setup phase (to demote setup_failure from unexpected to a sometimes + signal). 
+ """ + msg = getattr(exc, "msg", None) or str(exc) + return any( + pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS) + ) + + +def _worker_death_tolerable(occurred: Exception | None) -> bool: + """True when an early-exiting worker thread is plausibly a fault-injection + casualty rather than a bug to fail the run on. + + `parallel_workload.worker.Worker.run` performs its initial + `psycopg.connect` / websocket / `SET` statements outside any try/except, + so a fault that lands during worker startup kills the thread with + `occurred_exception = None` (no captured cause). Once the worker is + inside its main action loop, captured `QueryError`s that don't match + `errors_to_ignore` populate `occurred_exception` — those are the ones + we want to look at. If the captured exception matches a fault shape + (`_SETUP_FAULT_PATTERNS`) it's still the fault that killed the worker, + not a SUT correctness bug. + """ + if occurred is None: + return True + return _matches_setup_tolerance(occurred) + def _tolerate_setup_race(fn, *args, **kwargs): - """Run `fn(...)`, swallowing the concurrent-race messages in - `_SETUP_RACE_PATTERNS` and propagating anything else. + """Run `fn(...)`, swallowing the messages in `_SETUP_RACE_PATTERNS` or + `_SETUP_FAULT_PATTERNS` and propagating anything else. The setup phase is invoked by every parallel-driver invocation, and the framework picks deterministic object names from a small pool. Concurrent - invocations therefore race to drop-then-create the same names; any single - race outcome is fine because the per-invocation Database object only - needs its named objects to exist by the time worker threads start. + invocations therefore race to drop-then-create the same names; any + single race outcome is fine because the per-invocation Database object + only needs its named objects to exist by the time worker threads start. 
+ + Fault-induced errors (container paused, DNS partitioned, socket reset) + are absorbed for the same reason: they're expected under Antithesis, + not workload bugs. """ try: return fn(*args, **kwargs) except QueryError as exc: - if any(pat in (exc.msg or "") for pat in _SETUP_RACE_PATTERNS): + if _matches_setup_tolerance(exc): LOG.debug("setup tolerated: %s — %s", exc.query, exc.msg) return None raise except Exception as exc: # noqa: BLE001 - if any(pat in str(exc) for pat in _SETUP_RACE_PATTERNS): + if _matches_setup_tolerance(exc): LOG.debug("setup tolerated: %s", exc) return None raise @@ -649,10 +719,42 @@ def _run_invocation( {"ignored_errors": total_ignored}, ) + # Setup-phase failures whose message matches `_SETUP_*_PATTERNS` are + # either concurrent-driver races or Antithesis fault-injection + # consequences (paused container, partitioned DNS, reset socket). + # Neither is a SUT correctness issue, so demote them out of the + # `always(...)` assertion and into a `sometimes(...)` for visibility. + setup_tolerated = setup_failure is not None and _matches_setup_tolerance( + setup_failure + ) + sometimes( + setup_tolerated, + "parallel workload: setup phase tolerated a fault-injection or race error", + {"error": str(setup_failure) if setup_failure else None}, + ) + + # Worker-thread death under fault injection has the same + # "expected-not-a-bug" shape: an uncaptured-exception death (typically + # initial psycopg.connect failing because materialized was paused) or a + # captured exception whose message matches a fault pattern. 
+ worker_tolerated = worker_failed is not None and _worker_death_tolerable( + worker_failed.cause + ) + sometimes( + worker_tolerated, + "parallel workload: worker thread death tolerated as fault-injection consequence", + { + "error": ( + str(worker_failed.cause) if worker_failed and worker_failed.cause else None + ), + "uncaptured": worker_failed is not None and worker_failed.cause is None, + }, + ) + unexpected = None - if setup_failure is not None: + if setup_failure is not None and not setup_tolerated: unexpected = {"phase": "setup", "error": str(setup_failure)} - elif worker_failed is not None: + elif worker_failed is not None and not worker_tolerated: unexpected = { "phase": "worker", "error": ( From ff76a2767da2f76672d64258389ac1a019ae9199 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 10:31:56 -0400 Subject: [PATCH 48/65] test/antithesis: pivot pool design to permanent pool clusters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous design (CREATE+DROP CLUSTER per parallel-workload invocation, targeting a pool clusterd) hit a hard clusterd halt on the second invocation against the same slot: WARN: halting process: new instance configuration not compatible with existing instance configuration: ... index_logs: IntrospectionSourceIndex(...876897) vs Some(IntrospectionSourceIndex(...876641)) InstanceConfig::compatible_with compares LoggingConfig, which includes the per-cluster IntrospectionSourceIndex GlobalIds — those are freshly allocated by every CREATE CLUSTER. Pointing a different cluster identity at a clusterd that already saw a prior cluster's introspection indexes trips this check. reconcile() handles environmentd restarts against the same cluster identity, but not cluster-identity changes. Pivot to permanent pool clusters: one long-lived pool_cluster_{i} per clusterd-pool-{i}, bootstrapped by the workload-entrypoint once at compose-up and never dropped.
Each parallel-workload invocation picks a slot at random and runs against the pool cluster for that slot. The cluster identity stays stable per slot, so the only reconnect events the pool clusterds see are environmentd restarts and Antithesis-injected pauses of the pool clusterd itself — the path reconcile is designed for. Concurrent invocations may share a pool cluster: every workload object is in a seed-scoped database (db-pw-{seed}-*) with seed-scoped roles, so DDL/DML never collides across invocations. No coordination required — no fcntl.flock, no slot-claim contextmanager, no "no slot available, exit cleanly" fallback. Antithesis still faults containers, not invocations, so the per-container fault domain is preserved; two invocations witnessing the same fault simply give us more independent reproductions per failure. Framework changes: * Cluster.pre_existing_name: when set, create()/drop() are no-ops, name() returns the literal, is_pool_backed flips True for action skips. * Database.existing_cluster_name: replaces pool_members on the Antithesis path; wraps one pre-existing cluster. * ClusterReplica loses its pool_member field (no longer used). * action.CreateClusterAction checks existing_cluster_name (was pool_members). Driver changes: * Slot pick is rng.randrange(CLUSTERD_POOL_SIZE). No coordination. * _drop_seed_scoped_objects stops dropping clusters. The seed-scoped database drop cascades through every workload-created object on the cluster, returning the bound clusterd to an idle baseline. * Setup-phase pre-create sweep no longer touches mz_clusters. mzcompose / entrypoint changes: * Workload service env now exports ANTITHESIS_CLUSTERD_POOL_SIZE + CLUSTERD_POOL_SIZE so the bootstrap script and the driver agree on the slot count. * workload-entrypoint.sh loops POOL_SIZE times and CREATEs each pool_cluster_{i} on its clusterd-pool-{i} (idempotent across compose-up). Multi-replica parallel-workload coverage is gone in this iteration — each pool cluster has one replica.
Multi-replica coverage stays in antithesis_cluster; a future revision could pair pool clusterds into 2-replica pool clusters at the cost of doubling the pool footprint. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../materialize/parallel_workload/action.py | 22 +- .../materialize/parallel_workload/database.py | 139 +++++------ test/antithesis/config/docker-compose.yaml | 2 + test/antithesis/mzcompose.py | 6 + .../parallel-workload-fault-isolation.md | 217 +++++++++-------- .../test/parallel_driver_parallel_workload.py | 223 ++++++------------ .../workload/workload-entrypoint.sh | 40 ++++ 7 files changed, 297 insertions(+), 352 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 89d9c0801fc76..60161d9d1fe74 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -2026,12 +2026,12 @@ def run(self, exe: Executor) -> bool: class CreateClusterAction(Action): def run(self, exe: Executor) -> bool: - # In pool mode the Database's clusters are wired to pre-existing - # clusterd containers from a finite pool the caller passed in. - # Dynamically creating a new cluster would need to claim an unused - # pool member, and we don't have an allocator. Skip — the initial - # clusters set up at construction time are the test surface. - if exe.db.pool_members is not None: + # In existing-cluster mode the Database wraps a pre-existing + # (caller-supplied) cluster, typically bootstrapped by the + # Antithesis compose, and we have no allocator for additional + # clusters tied to other pool members. Skip — the wrapped + # cluster is the entire test surface. 
+ if exe.db.existing_cluster_name is not None: return False with exe.db.lock: if len(exe.db.clusters) >= MAX_CLUSTERS: @@ -2178,9 +2178,8 @@ def run(self, exe: Executor) -> bool: with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] - # Pool-backed clusters can't grow their replica count — there's - # no pool allocator handing out a fresh ClusterdPoolMember per - # ALTER CLUSTER ADD REPLICA. Skip them. + # Pre-existing (pool) clusters: the framework didn't create them + # and won't mutate them. Skip. unmanaged_clusters = [ c for c in unmanaged_clusters if not c.is_pool_backed ] @@ -2207,10 +2206,7 @@ def run(self, exe: Executor) -> bool: with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] - # Pool-backed clusters can't shrink either — without an - # allocator to release the pool member back, the in-memory - # model would diverge from materialize's catalog and later - # creates targeting the freed slot would conflict. + # Pre-existing (pool) clusters: same reasoning as above. Skip. unmanaged_clusters = [ c for c in unmanaged_clusters if not c.is_pool_backed ] diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index be639f9145f82..7034be033dae8 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -915,16 +915,17 @@ def create(self, exe: Executor) -> None: @dataclasses.dataclass(frozen=True) class ClusterdPoolMember: - """One entry in an external clusterd pool that a `Cluster` can target as - an unmanaged replica. + """Address+config of one external clusterd container the SUT will host an + unmanaged cluster replica on. 
- Used by callers (Antithesis parallel-driver) that want fault-isolation - per cluster: each pool member is its own container, so Antithesis can - kill/pause/partition exactly one cluster's storage+compute without - taking down the other clusters that share the materialized container's - process orchestrator. + Used by the Antithesis compose bootstrap (see test/antithesis/mzcompose.py) + to build the CREATE CLUSTER REPLICAS clause for each long-lived pool + cluster: one cluster per pool member, with this member as its sole + replica. After bootstrap the framework only references the cluster by + name (`existing_cluster_name`); pool members aren't passed into + `Database` directly. - The default ports match clusterd's defaults; override per environment. + Default ports match clusterd's defaults; override per environment. """ host: str @@ -941,26 +942,18 @@ class ClusterReplica: cluster: "Cluster" rename: int lock: threading.Lock - # When non-None, the replica is wired to a pre-existing clusterd - # container via unmanaged-cluster syntax (STORAGECTL/COMPUTE ADDRESSES) - # rather than provisioned through the orchestrator. The replica's - # `size` field is ignored in that case; `pool_member.workers` provides - # the WORKERS clause. - pool_member: ClusterdPoolMember | None def __init__( self, replica_id: int, size: str, cluster: "Cluster", - pool_member: ClusterdPoolMember | None = None, ): self.replica_id = replica_id self.size = size self.cluster = cluster self.rename = 0 self.lock = threading.Lock() - self.pool_member = pool_member def name(self) -> str: if self.rename: @@ -992,6 +985,12 @@ class Cluster: # parallel-driver, where many concurrent Database instances against one # materialize would otherwise collide on the same `cluster-N` names. name_scope: str + # When set, the cluster represents a pre-existing cluster the framework + # did not create and must not drop. 
`name()` returns this literally + # (bypassing cluster_id / rename / name_scope), and `create()` / `drop()` + # are no-ops. The replicas list is empty in this mode — the framework + # doesn't model the pre-existing replicas because it never touches them. + pre_existing_name: str | None def __init__( self, @@ -1001,26 +1000,20 @@ def __init__( replication_factor: int, introspection_interval: str, name_scope: str = "", - pool_members: list[ClusterdPoolMember] | None = None, + pre_existing_name: str | None = None, ): self.cluster_id = cluster_id self.managed = managed self.size = size - # When `pool_members` is supplied, the cluster runs in unmanaged mode - # against one pre-existing clusterd container per replica. We force - # `managed=False` (the unmanaged-cluster syntax is what carries the - # STORAGECTL/COMPUTE ADDRESSES clauses) and ignore `replication_factor` - # in favour of `len(pool_members)`. - if pool_members is not None: - if not pool_members: - raise ValueError( - "pool_members must be non-empty when provided; one member per replica" - ) + self.pre_existing_name = pre_existing_name + if pre_existing_name is not None: + # Pre-existing cluster: framework only models its name. The actual + # replicas live in materialize's catalog from the bootstrap step + # that created the cluster (see test/antithesis/mzcompose.py). + # Empty replicas list flips `is_pool_backed` to True, which is + # what the action classes use to skip DDL on this cluster. self.managed = False - self.replicas = [ - ClusterReplica(i, size, self, pool_member=pool_members[i]) - for i in range(len(pool_members)) - ] + self.replicas = [] else: self.replicas = [ ClusterReplica(i, size, self) for i in range(replication_factor) @@ -1033,13 +1026,19 @@ def __init__( @property def is_pool_backed(self) -> bool: - """True iff every replica is wired to a pre-existing clusterd - container rather than provisioned through the orchestrator. 
Action - classes that would mutate replica count check this and bail — - we don't dynamically allocate from the pool.""" - return all(r.pool_member is not None for r in self.replicas) + """True for clusters the framework didn't create itself and won't + mutate (replica count, drop). Currently set when `pre_existing_name` + was passed in. Action classes that would CREATE/ALTER/DROP REPLICA + check this and bail.""" + return self.pre_existing_name is not None def name(self) -> str: + # Pre-existing clusters: name is fixed by the caller (typically a + # pool-cluster the Antithesis compose bootstrapped). Don't apply + # naughtify / name_scope / rename — they don't apply to objects we + # didn't create. + if self.pre_existing_name is not None: + return self.pre_existing_name # Format: `cluster[-{name_scope}]-{cluster_id}[-{rename}]`. The # bracketed `-{name_scope}` segment is only present when seed- # scoping is on, so the historical `cluster-0` / `cluster-0-1` @@ -1055,29 +1054,13 @@ def __str__(self) -> str: return identifier(self.name()) def create(self, exe: Executor) -> None: + # Pre-existing cluster: the SUT already has it (bootstrapped at + # compose-up). The framework's only responsibility for the cluster + # is to use its name; never DDL it. + if self.pre_existing_name is not None: + return query = f"CREATE CLUSTER {self} " - if self.is_pool_backed: - # Unmanaged cluster pointing at pre-existing clusterd containers. - # Each replica gets the STORAGECTL/STORAGE/COMPUTECTL/COMPUTE - # ADDRESSES of its pool member; WORKERS comes from the pool - # member's config. Requires - # `unsafe_enable_unorchestrated_cluster_replicas = true` on the - # SUT (see test/antithesis/mzcompose.py for the Antithesis case). 
- replica_specs = [] - for replica in self.replicas: - assert replica.pool_member is not None - m = replica.pool_member - replica_specs.append( - f"{replica} (" - f"STORAGECTL ADDRESSES ['{m.host}:{m.storagectl_port}'], " - f"STORAGE ADDRESSES ['{m.host}:{m.storage_port}'], " - f"COMPUTECTL ADDRESSES ['{m.host}:{m.computectl_port}'], " - f"COMPUTE ADDRESSES ['{m.host}:{m.compute_port}'], " - f"WORKERS {m.workers}" - f")" - ) - query += "REPLICAS(" + ", ".join(replica_specs) + ")" - elif self.managed: + if self.managed: query += f"SIZE = '{self.size}', REPLICATION FACTOR = {len(self.replicas)}, INTROSPECTION INTERVAL = '{self.introspection_interval}'" else: query += "REPLICAS(" @@ -1145,13 +1128,13 @@ def __init__( # qualified by DB.name() which includes the seed, so they don't # need this. seed_scoped_names: bool = False, - # When non-None, every cluster the Database creates uses the - # external clusterd-pool backend (unmanaged-with-explicit-addresses) - # rather than the orchestrator. The Database slices this list one - # member per replica across its clusters at construction time. - # See `ClusterdPoolMember` for the shape; sized to fit the - # database's initial cluster + replica plan. - pool_members: list[ClusterdPoolMember] | None = None, + # When set, the Database runs against a pre-existing cluster the + # framework didn't create and won't drop. CreateClusterAction is + # disabled in this mode; the single initial cluster wraps the + # supplied name. Used by the Antithesis parallel-driver to bind + # each invocation to one of the long-lived pool clusters that the + # compose creates at bootstrap (see test/antithesis/mzcompose.py). 
+ existing_cluster_name: str | None = None, ): self.host = host self.ports = ports @@ -1159,7 +1142,7 @@ def __init__( self.scenario = scenario self.seed = seed self.seed_scoped_names = seed_scoped_names - self.pool_members = pool_members + self.existing_cluster_name = existing_cluster_name # The bare seed (no leading/trailing punctuation) used by Cluster / # Role / etc. to assemble their scoped names. Empty when seed-scoping # is off, in which case those classes fall back to their historical @@ -1205,24 +1188,24 @@ def __init__( ] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources. - # In pool mode, the entire `pool_members` list is consumed by a - # single unmanaged cluster — one replica per member — so the - # caller controls both replica count and pool-member identity. - # This is the only initial cluster; CreateClusterAction is - # disabled in pool mode (no in-band allocator). - if pool_members is not None: + # In existing-cluster mode the framework's sole initial cluster + # wraps a pre-existing cluster (typically a pool cluster the + # Antithesis compose bootstrapped). The wrapper's create()/drop() + # are no-ops; CreateClusterAction / CreateClusterReplicaAction / + # DropClusterReplicaAction are also disabled for it. + if existing_cluster_name is not None: self.clusters = [ Cluster( 0, - # managed/size are ignored when pool-backed but kept as - # placeholder values for any code that reads them - # without consulting `is_pool_backed`. + # managed / size / replication_factor are ignored when + # `pre_existing_name` is set — the wrapper never emits + # CREATE CLUSTER. 
managed=False, - size=pool_members[0].host, - replication_factor=len(pool_members), + size="", + replication_factor=1, introspection_interval="1s", name_scope=self.name_scope, - pool_members=pool_members, + pre_existing_name=existing_cluster_name, ) ] else: diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 900b586870e75..e1439653361bb 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -828,6 +828,8 @@ services: - KAFKA_BROKER=kafka:9092 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 - MZ_ANTITHESIS_CLUSTER=antithesis_cluster + - ANTITHESIS_CLUSTERD_POOL_SIZE=8 + - CLUSTERD_POOL_SIZE=8 - MYSQL_HOST=mysql - MYSQL_REPLICA_HOST=mysql-replica - MYSQL_PASSWORD=p@ssw0rd diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index d66c63eb3348f..2a75dce53b4e0 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -88,6 +88,12 @@ def __init__(self) -> None: # Name of the unmanaged cluster the workload-entrypoint # provisions against clusterd1 before emitting setup-complete. "MZ_ANTITHESIS_CLUSTER=antithesis_cluster", + # Pool size for the long-lived `pool_cluster_{i}` clusters + # the entrypoint bootstraps. Mirrored to the parallel- + # workload driver (CLUSTERD_POOL_SIZE) so they agree on the + # slot count. + f"ANTITHESIS_CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}", + f"CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}", # MySQL primary and replica connection details. 
"MYSQL_HOST=mysql", "MYSQL_REPLICA_HOST=mysql-replica", diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md index f9e182c043ab5..884a030b704db 100644 --- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md +++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md @@ -27,12 +27,19 @@ environmentd's process orchestrator and the fault domain disappears. ## Solution A pool of identical pre-deployed clusterd containers -(`clusterd-pool-{0..N-1}`). Each invocation claims up to -`PW_DESIRED_REPLICAS` (default 2) slots via filesystem locking and -provisions a single unmanaged cluster with one replica per claimed -slot, then releases the locks on exit. Best-effort: with N slots -claimed the cluster runs as an N-replica cluster (1 ≤ N ≤ desired); -no slots → exit cleanly. +(`clusterd-pool-{0..N-1}`) with a corresponding pool of long-lived +unmanaged clusters (`pool_cluster_{0..N-1}`), each bound to its slot's +clusterd. Pool clusters are bootstrapped once by the workload-entrypoint +and outlive every individual parallel-workload invocation. Each +parallel-workload invocation picks a slot at random and runs against +`pool_cluster_{slot}`. There is no coordination between concurrent +invocations: every workload object lives in a seed-scoped database +(`db-pw-{seed}-*`) with seed-scoped roles, so two invocations sharing a +pool cluster don't collide. Antithesis faults containers, not +invocations, so the per-container fault domain is preserved either way; +two invocations witnessing the same fault is a feature (more +independent reproductions per failure). The pool cluster itself is +never dropped. Components, bottom-up: @@ -43,121 +50,123 @@ Components, bottom-up: mem_env RocksDB (matches production, no scratch volume to fight over). Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8). 
- - **`parallel_workload.Database(pool_members=..., + - **Pool-cluster bootstrap** in + `test/antithesis/workload/workload-entrypoint.sh`. After materialized + becomes healthy, the script loops over `0..POOL_SIZE-1` and issues + `CREATE CLUSTER pool_cluster_{i} REPLICAS (r1 (STORAGECTL ADDRESSES + ['clusterd-pool-{i}:2100'], ...))` for each pool member that doesn't + already exist. Idempotent across compose-up cycles. Once setup- + complete is emitted, every pool cluster is ready for the test + composer to start invoking the parallel-workload driver. + + - **`parallel_workload.Database(existing_cluster_name=..., seed_scoped_names=True)`**. Opt-in framework mode: when - `pool_members` is set, the framework provisions one unmanaged - cluster with `len(pool_members)` replicas, each pointed at a pool - member via explicit STORAGECTL/STORAGE/COMPUTECTL/COMPUTE ADDRESSES - (in place of managed SIZE/REPLICATION FACTOR); the CreateCluster / - CreateReplica / DropReplica actions skip pool-backed clusters - because there is no in-band allocator. `seed_scoped_names=True` - renames `cluster{N}` / `role{N}` to `cluster-{seed}-{N}` / - `role-{seed}-{N}` so concurrent invocations don't collide on - global names. - - - **`_claim_pool_slots()`** in - `test/antithesis/workload/test/parallel_driver_parallel_workload.py`. - Contextmanager that holds up to `PW_DESIRED_REPLICAS` exclusive - `fcntl.flock`s on `/tmp/clusterd-pool-slots/{i}.lock` for the - lifetime of the invocation. Slots are tried in randomized order so - allocation is decorrelated from invocation seed. Every claimed lock - is released on context exit (normal or exception), so a crashing - driver doesn't strand any slot. + `existing_cluster_name` is set, the framework's single initial + cluster is a wrapper around the pre-existing cluster — `create()` + and `drop()` are no-ops, `is_pool_backed` is True (which gates the + CreateCluster / CreateReplica / DropReplica actions). 
`Cluster.name()` + returns the literal cluster name supplied by the caller, bypassing + the framework's normal `cluster-{seed}-{id}` shape. Roles still get + seed-scoped naming (`role-{seed}-{N}`) so concurrent invocations + don't collide on those. + + - **Slot pick** in + `test/antithesis/workload/test/parallel_driver_parallel_workload.py`: + `rng.randrange(CLUSTERD_POOL_SIZE)`. Stateless, no coordination, + no failure mode. Concurrent invocations may share a pool cluster + (see the no-collision argument above). - **`_drop_seed_scoped_objects()`** in the same driver, called in - `main()`'s `finally`. Drops every cluster / database / role whose - name starts with `cluster-{seed}-` / `db-pw-{seed}-` / - `role-{seed}-`. The DROP CLUSTER re-arms the clusterd to be - claimed by the next invocation through the reconcile path - (see below). - -## Clusterd reuse correctness - -The pool design assumes a DROP CLUSTER followed by a CREATE CLUSTER -pointed at the same clusterd is a supported transition. It is — this is -the same reconciliation path that handles environmentd restart. The -three pieces: - - 1. **Transport cancels the prior connection on every new connect.** - `src/service/src/transport.rs::serve` drops the old - connection-task token and awaits the task before installing a - fresh handler from `handler_fn()`. The new `ClusterClient` is a - blank-slate wrapper around the same `Arc>`. - - 2. **The worker `run` loop survives client disconnects.** - `src/storage/src/storage_state.rs::Worker::run` is - `while let Some((nonce, rx, tx)) = client_rx.blocking_recv() { - run_client(rx, tx); }`. When the old `cmd_tx` is dropped (because - the cancel above tore down the prior client), `run_client` returns - and the outer loop awaits the next `(nonce, rx, tx)` — the new - controller's connection. Worker in-memory state stays resident - between connections. - - 3. 
**`reconcile()` drops stale state.** The new controller's first - batch of commands ending in `InitializationComplete` is processed - by `storage_state::reconcile`: it computes `expected_objects` from - the new commands, identifies `stale_objects` as anything the - worker knows about that the new controller did not ask for, and - `drop_collection`s each one — releasing source tokens (which tears - down Kafka consumers, persist write handles, upsert RocksDB state), - dropping dataflows, clearing reported frontiers. - -Collection IDs do not collide across cluster lifetimes because -Materialize allocates them globally (`u`, `t`), not per cluster. - -The one piece intentionally shared across reconnects is the -`Arc`. It is keyed by URL+credentials, not by -cluster identity, and reusing it is the standard production behavior -(avoids reauthenticating to S3 / postgres-metadata on every reconnect). - -The same analysis holds for the compute side (`src/compute/src/server.rs` -uses the same `ClusterSpec` pattern). + `main()`'s `finally`. Drops every database and role whose name + starts with `db-pw-{seed}-` / `role-{seed}-`. **Pool clusters are + NOT dropped** — they're permanent state shared across invocations. + The DROP DATABASE CASCADE transitively drops every workload-created + table / MV / index / source / sink, which tears down the + corresponding dataflows on the bound clusterd container, so the + cluster returns to an idle baseline before the next claimant. + +## Why pool clusters must be permanent: the clusterd-reuse constraint + +The first iteration of this design dropped and recreated the parallel- +workload cluster on every invocation. That failed on the second +invocation against the same pool slot with a clusterd halt: + +> `WARN ...: halting process: new instance configuration not compatible +> with existing instance configuration: ... index_logs: +> {Timely(Operates): IntrospectionSourceIndex(144115188075856897), ...} +> vs Some(... 
IntrospectionSourceIndex(144115188075856641), ...)` + +The check is `InstanceConfig::compatible_with` in +`src/compute-client/src/protocol/command.rs`. It compares `LoggingConfig` +including `index_logs: BTreeMap`. +Those introspection-source-index IDs are per-cluster catalog allocations +— every CREATE CLUSTER produces a fresh batch. Pointing a *different* +cluster identity at a clusterd that already saw a prior cluster's +introspection indexes trips this check and the clusterd halts on the +first `CreateInstance` command. + +Reconcile (`storage_state::reconcile`, `compute::server`) handles the +case where the *same* cluster reconnects after an environmentd restart: +the worker drops stale collections, takes the new commands, and resumes. +But it does not handle the case where a different cluster claims the +clusterd, because the introspection indexes don't match. + +Pinning cluster identity to clusterd identity — one permanent pool +cluster per pool clusterd container — sidesteps the check entirely. The +only reconnect events the pool clusterds see across the lifetime of a +compose are environmentd restarts (and Antithesis-injected pauses / +restarts of the pool clusterd itself), both of which exercise the same +cluster identity reconnecting. That's the path reconcile is designed for. ## Failure modes - - **All pool slots held.** Driver tags `sometimes(...)` for - visibility and exits cleanly. With the default pool size (8) and - the test composer's normal concurrency this is not expected to - fire, but if it does we'll see it in the run report. - - - **Crash before drop-on-exit runs.** The flock is released - automatically when the process dies (kernel-level lock release). - The clusterd is left holding stale state until the next claimant - reconciles. Catalog leftovers (`cluster-{seed}-*`, - `role-{seed}-*`, `db-pw-{seed}-*`) accumulate until the next - invocation with the same seed runs its setup sweep — extremely - unlikely since seeds are u64-random. 
The setup sweep is scoped - to the current seed only, so it does not clean cross-invocation - leftovers. A periodic external cleanup or a startup-time scan - against `mz_clusters` / `mz_roles` / `mz_databases` would be - needed to close this loop properly. For now the catalog growth - is bounded by run length and not currently a problem. - - - **Pool sizing wrong vs concurrency.** If concurrency exceeds pool - size, the late arrivals get "no slot" and exit. We do not currently - auto-tune; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if telemetry shows - the "no slot available" signal firing. + - **Crash before drop-on-exit runs.** The seed-scoped database and + roles are left in the catalog until they're explicitly cleaned up. + Catalog leftovers do not break correctness (each seed is u64-random, + no cross-invocation collisions) but they accumulate. The next + invocation that lands on the same pool cluster will inherit MVs / + indexes / sources still rendered on the bound clusterd from the + crashed invocation, which is more state pressure than a clean + handoff. A periodic / startup-time sweep against `mz_databases` / + `mz_roles` would close this; deferred until it shows up as a + problem. + + - **Pool size much smaller than concurrency.** With C concurrent + invocations and N pool slots, ~C/N invocations share each cluster + in steady state. That's correctness-preserving but increases + per-cluster state pressure linearly with the ratio. Bump + `ANTITHESIS_CLUSTERD_POOL_SIZE` if a single pool cluster runs hot. ## v1 limitations (future work) + - **Single-replica pool clusters.** Each pool cluster has one replica + (one clusterd container per cluster), so parallel-workload + invocations don't exercise multi-replica compute/storage paths. + Multi-replica coverage stays in `antithesis_cluster`. A future + revision could pair clusterd containers into 2-replica pool + clusters at the cost of doubling the pool footprint per + concurrency unit. 
+ - **No in-band allocator inside the framework.** Worker threads - cannot grab additional pool members mid-run, so + cannot grab additional pool clusters mid-run, so `CreateClusterAction` / `CreateClusterReplicaAction` / `DropClusterReplicaAction` are skipped when pool-backed. The - framework only ever touches the pre-allocated pool members. + framework only ever touches the pre-existing pool cluster. - - **No global GC of cross-invocation catalog leftovers.** See - failure modes above. A first-invocation sweep against - `mz_clusters WHERE name LIKE 'cluster-%-%'` minus the current - seed would close this; deferred until it becomes a problem. + - **State accumulation on pool clusters.** Each pool cluster runs + through O(invocations) workload lifecycles over a long Antithesis + run. Even with seed-scoped DBs being dropped on exit, every pool + cluster's clusterd retains compute-side bookkeeping (catalog + state for introspection, peek_stash subscriptions, etc.). The + framework relies on `drop_collection` to release dataflow state; + if that path ever leaks, the pool cluster's memory footprint will + grow over many invocations. ## Tunables | Variable | Default | Effect | |---|---|---| -| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose) | 8 | Number of `clusterd-pool-{i}` containers deployed. | -| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver will attempt to claim. Must match the compose value. | -| `CLUSTERD_POOL_SLOT_LOCK_DIR` (driver) | `/tmp/clusterd-pool-slots` | Directory holding the per-slot flock files. | -| `PW_DESIRED_REPLICAS` (driver) | 2 | Replicas to ask for per invocation's cluster. Best-effort: driver claims up to this many slots and runs with whatever it gets (≥1). | +| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 8 | Number of clusterd-pool- containers deployed and matching pool_cluster_ clusters bootstrapped. | +| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver chooses among. 
Mirrored from compose by mzcompose.py's Workload service so the two agree. | | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. | | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. | diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index edbce1b622636..427a0babc0f16 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -38,8 +38,6 @@ from __future__ import annotations -import contextlib -import fcntl import logging import os import random @@ -80,7 +78,6 @@ MAX_TABLES, MAX_VIEWS, MAX_WEBHOOK_SOURCES, - ClusterdPoolMember, Database, ) from materialize.parallel_workload.executor import Executor @@ -108,38 +105,24 @@ RUNTIME_S = float(os.environ.get("PW_RUNTIME_S", "20")) NUM_THREADS = int(os.environ.get("PW_THREADS", "4")) -# Number of clusterd-pool-{i} containers reserved for the parallel-workload -# driver. Must match the pool actually deployed in -# test/antithesis/mzcompose.py (ANTITHESIS_CLUSTERD_POOL_SIZE there → -# CLUSTERD_POOL_SIZE here). Each parallel-workload invocation claims -# slots via `fcntl.flock` (see `_claim_pool_slots`); the locks are held -# for the lifetime of the invocation so concurrent driver processes -# inside the workload container can't pick the same clusterd. +# Number of long-lived pool_cluster_{i} clusters the workload-entrypoint +# bootstrapped, one per clusterd-pool-{i} container. Must match +# `ANTITHESIS_CLUSTERD_POOL_SIZE` in test/antithesis/mzcompose.py (the +# Workload service mirrors that value into the workload container's env +# as both ANTITHESIS_CLUSTERD_POOL_SIZE and CLUSTERD_POOL_SIZE so the +# bootstrap script and driver agree). +# +# Each invocation picks a pool slot at random and runs against the +# corresponding pool_cluster_{i}.
No coordination between concurrent +# invocations: two invocations may share a pool cluster — every workload +# object is in a seed-scoped database (`db-pw-{seed}-*`) with seed-scoped +# roles, so DDL/DML never collides; the only shared state is the +# permanent pool cluster, which is purposefully shared. Antithesis still +# faults one container at a time, so the per-container fault domain is +# preserved; multiple invocations witnessing the same fault is a +# feature (more independent reproductions per failure). CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8")) -# Workers configured per clusterd-pool-{i} process. Must match the -# `Clusterd(..., workers=...)` argument in test/antithesis/mzcompose.py -# or the unmanaged CREATE CLUSTER REPLICA's `WORKERS` count will diverge -# from what clusterd actually runs. -CLUSTERD_POOL_WORKERS = 4 - -# Replicas to ask for per invocation's cluster. Best-effort: the driver -# claims up to this many pool slots and runs whatever it gets (≥1). With -# DESIRED_REPLICAS=2 and POOL_SIZE=8 we get multi-replica coverage for -# the parallel-workload cluster (currently only `antithesis_cluster` is -# multi-replica) when capacity allows, while degrading gracefully to a -# single-replica cluster under contention. -DESIRED_REPLICAS = int(os.environ.get("PW_DESIRED_REPLICAS", "2")) - -# Filesystem locks let concurrent parallel-workload invocations claim -# distinct clusterd-pool members without coordinating through the SUT. -# All invocations exec inside the single `workload` container so a -# regular flock on a tmpfs path is sufficient (no cross-container -# coordination required).
-POOL_SLOT_LOCK_DIR = os.environ.get( - "CLUSTERD_POOL_SLOT_LOCK_DIR", "/tmp/clusterd-pool-slots" -) - def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None: try: @@ -328,66 +311,21 @@ def _tolerate_setup_race(fn, *args, **kwargs): raise -@contextlib.contextmanager -def _claim_pool_slots(rng: random.Random, desired: int): - """Hold exclusive `fcntl.flock`s on up to `desired` pool-slot lockfiles - for the duration of the `with` block. Yields the list of claimed slot - indices (length 0–`desired`); the caller decides what to do with each - population (1 = single-replica fallback, ≥2 = multi-replica cluster, - 0 = no slots available, exit cleanly). - - Slots are tried in randomized order so allocation is decorrelated - from invocation seed / wall clock. Every claimed flock is released - when the context exits — normally or via exception — so a crashing - driver doesn't strand any slot. - - All parallel-workload driver invocations share the workload - container's filesystem, so plain flock on a tmpfs path under - `POOL_SLOT_LOCK_DIR` is sufficient serialization (no cross-container - coordination required). 
- """ - try: - os.makedirs(POOL_SLOT_LOCK_DIR, exist_ok=True) - except OSError as exc: - LOG.warning("pool slot lock dir %s unavailable: %s", POOL_SLOT_LOCK_DIR, exc) - yield [] - return - - slots = list(range(CLUSTERD_POOL_SIZE)) - rng.shuffle(slots) - held: list[tuple[int, int]] = [] # (slot, fd) - try: - for slot in slots: - if len(held) >= desired: - break - path = os.path.join(POOL_SLOT_LOCK_DIR, f"{slot}.lock") - fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o600) - try: - fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) - except OSError: - os.close(fd) - continue - held.append((slot, fd)) - yield [slot for slot, _ in held] - finally: - for _, fd in held: - try: - fcntl.flock(fd, fcntl.LOCK_UN) - finally: - os.close(fd) - - def _drop_seed_scoped_objects(seed: str) -> None: - """Drop everything this invocation's seed owns: its clusters, roles, and - databases. Called from `main()`'s finally so each invocation leaves the - catalog clean and frees its pool-slot's clusterd to be claimed by the - next driver run (DROP CLUSTER tears down the unmanaged replica → the - clusterd's existing controller connection ends → the next CREATE - CLUSTER pointed at the same address claims it via fresh reconcile). + """Drop everything this invocation's seed owns: its databases and roles. + + Pool clusters are NOT dropped — they're long-lived, bootstrapped by the + workload-entrypoint, and shared (one cluster per slot) across many + invocations. The DROP DATABASE CASCADE here transitively drops every + table / MV / index / source / sink the workload created on the pool + cluster during its run, which tears down the corresponding dataflows + on the bound clusterd container — so the cluster goes back to an idle + baseline before the next invocation claims the same slot. Errors here are logged and swallowed: leftover objects only cost a bit - of catalog footprint until the next invocation's setup sweep picks them - up. Don't let a cleanup failure turn into an assertion failure. 
+ of catalog footprint until the next invocation with the same seed + re-runs (extremely unlikely since seeds are u64-random). Don't let a + cleanup failure turn into an assertion failure. """ from pg8000.native import identifier @@ -413,12 +351,6 @@ def _drop(sql: str) -> None: except Exception as exc: # noqa: BLE001 LOG.debug("cleanup tolerated: %s — %s", sql, exc) - cur.execute( - f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'".encode() - ) - for row in cur.fetchall(): - _drop(f"DROP CLUSTER {identifier(row[0])} CASCADE") - cur.execute( f"SELECT name FROM mz_databases WHERE name LIKE 'db-pw-{seed}-%'".encode() ) @@ -440,11 +372,10 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None: sql-server, and an external postgres source — none of those are running in this compose. - Catalog sweeps are scoped to objects this invocation owns: clusters - matching `cluster-{seed}-%` and roles matching `role-{seed}-%`. The - seed-scoped names are produced by `Database(seed_scoped_names=True)`; - cleaning anything broader would delete state belonging to other - concurrent invocations sharing the same SUT. + Catalog sweeps are scoped to objects this invocation owns: roles + matching `role-{seed}-%`. Pool clusters are NOT touched — they're + long-lived state shared across many invocations (one cluster per + pool slot, bootstrapped by the workload-entrypoint). The shared connections / secret (`kafka_conn`, `csr_conn`, `aws_conn`, `minio`) live outside any seed-scoped database and are required by every @@ -468,12 +399,6 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None: # `seed` is the random_u64 the driver minted at the top of main(), so # it's already safe to splice into SQL literally. `Executor.execute` # takes a query string and doesn't support parameter binding. 
- exe.execute(f"SELECT name FROM mz_clusters WHERE name LIKE 'cluster-{seed}-%'") - for row in exe.cur.fetchall(): - _tolerate_setup_race( - exe.execute, f"DROP CLUSTER {identifier(row[0])} CASCADE" - ) - exe.execute(f"SELECT name FROM mz_roles WHERE name LIKE 'role-{seed}-%'") for row in exe.cur.fetchall(): _tolerate_setup_race(exe.execute, f"DROP ROLE {identifier(row[0])}") @@ -561,69 +486,53 @@ def main() -> int: _prepare_system(NUM_THREADS) - # Claim up to DESIRED_REPLICAS pool slots; the cluster runs with as - # many replicas as we got (≥1). Locks are held until main() returns; - # if no slot is free we tag a sometimes() and exit cleanly. + # Pick a pool slot at random. Each slot maps to a long-lived + # pool_cluster_ bootstrapped by the workload-entrypoint, with + # one replica on the matching clusterd-pool- container. # - # Each replica lands on its own clusterd-pool-{slot} container, so - # Antithesis can fault one replica's container without taking the - # cluster offline — exercises the same multi-replica recovery paths - # `antithesis_cluster` covers, but on the workload-driven cluster. 
- with _claim_pool_slots(rng, DESIRED_REPLICAS) as pool_slots: - sometimes( - len(pool_slots) > 0, - "parallel workload: clusterd pool slots claimed", - {"pool_size": CLUSTERD_POOL_SIZE, "claimed": len(pool_slots)}, - ) - sometimes( - len(pool_slots) >= DESIRED_REPLICAS, - "parallel workload: full multi-replica pool claim", - {"pool_size": CLUSTERD_POOL_SIZE, "desired": DESIRED_REPLICAS}, - ) - if not pool_slots: - LOG.info( - "parallel-workload exiting cleanly: no pool slots available " - "(pool_size=%d)", - CLUSTERD_POOL_SIZE, - ) - return 0 - pool_members = [ - ClusterdPoolMember( - host=f"clusterd-pool-{slot}", - workers=CLUSTERD_POOL_WORKERS, - ) - for slot in pool_slots - ] - LOG.info( - "parallel-workload claimed %d pool slot(s): %s", - len(pool_slots), - ", ".join(m.host for m in pool_members), - ) - return _run_invocation(seed, rng, pool_members) + # No coordination with other concurrent driver invocations: all + # workload state is in a seed-scoped database, so two invocations + # sharing a pool cluster don't collide. Antithesis still faults + # containers, not invocations, so the per-container fault domain + # is preserved; multiple invocations witnessing the same fault give + # us more independent reproductions per failure. + # + # Keeping the cluster identity per slot is what makes clusterd reuse + # safe across invocations (reconnects against the same cluster pass + # `InstanceConfig::compatible_with`; reconnects against a *different* + # cluster trip clusterd's introspection-index mismatch halt). + pool_slot = rng.randrange(CLUSTERD_POOL_SIZE) + cluster_name = f"pool_cluster_{pool_slot}" + LOG.info( + "parallel-workload using pool slot %d → cluster %s", + pool_slot, + cluster_name, + ) + return _run_invocation(seed, rng, cluster_name) def _run_invocation( seed: str, rng: random.Random, - pool_members: list[ClusterdPoolMember], + cluster_name: str, ) -> int: - """The bulk of `main()` once pool slot(s) have been claimed. 
Split out - so the slot locks stay held across this whole call: they are released - when the enclosing `with` block in `main()` exits. + """The bulk of `main()` once a pool slot has been picked. Split out + so `main()` stays a thin wrapper. No slot lock is held: concurrent + invocations may run against the same pool cluster at the same time. """ # `Scenario.Kill` widens `Action.errors_to_ignore` to absorb connection # drops, which mirrors what Antithesis container-pauses look like at the # client. We never instantiate `KillAction` itself. # - # `seed_scoped_names=True` keeps cluster/role names from colliding when - # concurrent invocations share the SUT — see _SETUP_RACE_PATTERNS for - # the fallback when they collide anyway. + # `seed_scoped_names=True` keeps role names from colliding when + # concurrent invocations share the SUT. # - # `pool_members=pool_members` makes a single unmanaged cluster with one - # replica per member; the framework forces managed=False and emits - # unmanaged CREATE CLUSTER with explicit STORAGECTL/COMPUTE ADDRESSES - # for each replica. + # `existing_cluster_name=cluster_name` makes the Database wrap the + # pool cluster bootstrapped at compose-up; the framework's + # CreateClusterAction / CreateClusterReplicaAction / + # DropClusterReplicaAction are disabled for it and Cluster.create() + # / Cluster.drop() are no-ops. 
database = Database( rng=rng, seed=seed, @@ -639,7 +548,7 @@ def _run_invocation( scenario=Scenario.Kill, naughty_identifiers=False, seed_scoped_names=True, - pool_members=pool_members, + existing_cluster_name=cluster_name, ) end_time = time.time() + RUNTIME_S diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index 1a8aab5234f51..bfef3f12e4c1a 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -17,6 +17,10 @@ PGUSER="${PGUSER:-materialize}" PGPORT_INTERNAL="${PGPORT_INTERNAL:-6877}" PGUSER_INTERNAL="${PGUSER_INTERNAL:-mz_system}" CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}" +# Number of long-lived pool clusters to bootstrap, each bound to its own +# clusterd-pool-{i} container. Must match `ANTITHESIS_CLUSTERD_POOL_SIZE` +# in mzcompose.py and `CLUSTERD_POOL_SIZE` in the parallel-workload driver. +CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-8}" # Wait for materialized to be ready. echo "Waiting for materialized to become healthy..." @@ -64,6 +68,42 @@ else echo "Cluster '$CLUSTER' already exists; skipping provisioning." fi +# Bootstrap a long-lived `pool_cluster_{i}` for each clusterd-pool-{i} +# container. Each pool cluster has exactly one replica wired to its +# matching pool clusterd. Parallel-workload driver invocations pick a +# slot at random (no locking; concurrent invocations may share one) and run +# against `pool_cluster_{slot}` for their entire lifetime. The cluster +# identity is tied to the clusterd identity, so reconnects don't trip +# clusterd's `instance configuration not compatible` halt; only the +# seed-scoped database / roles get dropped between invocations. +# +# Idempotent: skip pool clusters that already exist (the SUT's catalog +# survives across `docker compose up` if metadata volumes aren't wiped). 
+for i in $(seq 0 $((CLUSTERD_POOL_SIZE - 1))); do + POOL_CLUSTER="pool_cluster_$i" + existing_pool=$( + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" -tAc \ + "SELECT 1 FROM mz_clusters WHERE name = '$POOL_CLUSTER'" + ) + if [[ -n "$existing_pool" ]]; then + echo "Pool cluster '$POOL_CLUSTER' already exists; skipping provisioning." + continue + fi + echo "Provisioning pool cluster '$POOL_CLUSTER' on clusterd-pool-$i..." + psql -h "$PGHOST" -p "$PGPORT_INTERNAL" -U "$PGUSER_INTERNAL" < Date: Thu, 14 May 2026 10:57:43 -0400 Subject: [PATCH 49/65] test/antithesis: add upsert-ancient-key-writable cross-invocation property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling property upsert-key-reflects-latest-value tests only freshly- written keys within one invocation. This new property tests the complementary case: a key that has been resident in the upsert source for a long time (many invocations, many faults, possibly many clusterd restarts) must still accept a fresh write and have that write reflected in the source. The bug class this catches is a long-resident-state rehydration regression where the upsert operator's state-store remembers a key's value with enough fidelity to serve reads but enough wrongness that fresh writes are silently dropped — the user's pipeline appears stuck with no error. Implementation: parallel_driver_upsert_ancient_key_writable.py owns a dedicated key ring (ancient-k<0..31>) so it never collides with the sibling driver's per-invocation keys. Each invocation picks 5 ring slots at random, snapshots their current values, produces fresh 'cross--' values, waits for catchup, and asserts that each key's reflected value changed (or, for first-touch ring slots, that a row now exists). 
The 'always' assertion is race-tolerant against concurrent invocations of this driver writing to the same ring slot — the only forbidden outcome is 'row still has the exact old value we tried to overwrite, with no peer interference,' which means our write was silently lost. A separate 'sometimes' clause records when our specific new value reached the source as the win-the-race liveness signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../properties/upsert-ancient-key-writable.md | 72 +++++ ...llel_driver_upsert_ancient_key_writable.py | 273 ++++++++++++++++++ 2 files changed, 345 insertions(+) create mode 100644 test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md create mode 100644 test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py diff --git a/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md b/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md new file mode 100644 index 0000000000000..c23448a04163b --- /dev/null +++ b/test/antithesis/scratchbook/properties/upsert-ancient-key-writable.md @@ -0,0 +1,72 @@ +# upsert-ancient-key-writable + +## Summary + +After a key has been resident in an UPSERT-envelope source for a long time (many invocations, many fault windows, many materialized restarts), writing a fresh `(key, value)` to that key must still update the source's view of it. + +The bug class this catches: an upsert state-store rehydration regression that "remembers" a key's old value with enough fidelity to serve reads but enough wrongness that fresh writes are silently dropped, leaving the source stuck at a value it shouldn't be at. + +## Why this isn't just `upsert-key-reflects-latest-value` + +`upsert-key-reflects-latest-value` is verified per invocation: each invocation writes its own short-lived keys, settles, asserts. The keys it touches were freshly created at the start of that invocation. 
The rehydration / long-resident-state code paths are exercised only incidentally, when an Antithesis fault lands between a produce and a check inside one invocation. + +`upsert-state-rehydrates-correctly` covers explicit clusterd-restart rehydration but only over a single invocation's worth of state, and only for read-after-rehydrate, not write-after-rehydrate. + +This property covers the gap: long-resident keys plus fresh writes against them. + +## Code paths + +- `src/storage/src/upsert.rs` — `upsert_classic` operator. The `multi_get` → check `from_time > prior_order` → `multi_put` sequence must work the same whether `multi_get` returns a value the worker freshly observed in this incarnation or one rehydrated from persist state at startup. +- `src/storage/src/upsert/types.rs` — `StateValue::ensure_decoded` finalizes consolidating state into either a `Value` or a tombstone. If `ensure_decoded` ever yields a stale value mismatching the persist state, this property will surface it after the next write. +- `src/storage/src/upsert_continual_feedback.rs` — same contract, persist-feedback flavor. + +## How to check it + +Workload procedure (per invocation): +1. Pick K=5 keys at random from a fixed ring of N=32 keys owned by this driver: `ancient-k{0..31}`. +2. For each picked key, `SELECT text FROM source WHERE key = ?` at real-time recency. Record `old_value` (which may be `NULL` if no prior invocation wrote that ring slot yet). +3. Produce a fresh value `cross--` to each picked key on Kafka. +4. Request a quiet period and wait for `offset_committed` to reach the produced max offset. +5. Re-query each key. Assert: + - If `old_value` was present: post-catchup the source's view must NOT equal `old_value`. Race-tolerant against concurrent peers writing or tombstoning the same key — those outcomes also change the value, and only "row still has the exact old value while no one else touched it" indicates our write was silently dropped. 
+ - If `old_value` was absent (first-time write to that ring slot): post-catchup a row must exist for the key. + +## What goes wrong on violation + +A write to a long-resident upsert key is silently dropped: Kafka acked the produce, materialize ingested the message (`offset_committed` advanced), but the upsert state did not update. Read-only paths still return the old value; the user sees their pipeline "stuck" with no error. + +## Antithesis angle + +The interesting time window is the time between the ring slot's most recent write and the next one. In a long Antithesis run, that window spans many fault injections, clusterd restarts, and materialize-driven rehydrations of the upsert source's persist state. The longer the run, the more genuinely "ancient" the prior value is when we revisit. + +Combine with: +- Node-termination faults — exercises the rehydration path between writes to the same ring slot. +- Network-partition faults between materialized and clusterd-pool members — exercises feedback-channel recovery. +- Long-lived runs (multi-hour) — gives time for many ring-slot revisits with intervening faults. + +## Dependencies + +- The fixed ring `ancient-k{0..31}` is namespaced away from any sibling driver's keys, so this driver doesn't interfere with `upsert-key-reflects-latest-value`'s assertions. +- Two concurrent invocations of THIS driver picking the same ring slot is the race the `always` assertion is designed to tolerate. The `sometimes` clause "our specific new value reached the source" still fires when one invocation wins. + +## Existing instrumentation + +None. Candidate SUT anchors: `assert_sometimes!(upsert_long_resident_key_written, ...)` at the `multi_put` site, conditioned on the key's `from_time` being at least N minutes behind wall clock, would confirm the property's specific path is exercised. Deferred. + +## Implementation status + +Implemented as `test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py`. 
+ +| Message | Type | Fires when | +|---------|------|------------| +| `"upsert: write to ancient key changes its reflected value"` | `always` | Per ancient ring slot that had a prior value, post-catchup. False ⟺ row still present with the exact pre-write value. | +| `"upsert: write to previously-empty ancient key creates a row"` | `always` | Per ancient ring slot that was empty before our write, post-catchup. False ⟺ no row exists despite our non-null produce + catchup. | +| `"upsert: at least one ancient-ring key has a prior value to overwrite"` | `sometimes` | Per invocation. Confirms the property's interesting path (overwrite, not first-touch) is exercised. | +| `"upsert: source caught up after cross-invocation produces"` | `sometimes` | Per invocation. Liveness for the catchup gate. | +| `"upsert: cross-invocation driver's own write reached the source"` | `sometimes` | Per invocation. Confirms the full write→catchup→read pipeline works end-to-end at least sometimes (most of the time, under low concurrency). | + +Knobs: `ANCIENT_KEY_RING_SIZE=32`, `ANCIENT_KEYS_PER_INVOCATION=5`, `QUIET_PERIOD_S=20`, `CATCHUP_TIMEOUT_S=60.0`. + +## Provenance + +Surfaced by: Data Integrity (long-lived upsert state correctness). diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py new file mode 100644 index 0000000000000..296bf115fd425 --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +"""Antithesis driver for property `upsert-ancient-key-writable`. + +The sibling driver `parallel_driver_upsert_latest_value.py` writes keys +under a fresh per-invocation prefix (`p-k{0..7}`) and only ever +revisits its own keys *within* an invocation. Each key it writes is +abandoned at end-of-invocation — the row persists in the upsert source, +but no future invocation of that driver ever touches it again. + +This driver exercises the question: *if I write into a key that has +been resident in the upsert source for a long time, does the source +still reflect the write?* A failure here looks like an upsert state- +rehydration bug surfacing only after enough time / fault-injection has +elapsed: the source remembers the key from when it was originally +written, but a fresh write produces no observable change. + +To exercise that property without interfering with the sibling driver +(whose `always("upsert: SELECT for key matches latest produced value", +…)` assumes nothing concurrent is writing to its prefixed keys), this +driver owns its own key ring `ancient-k{0..N-1}`. Each invocation picks +K of them at random, snapshots their current values, produces fresh +values, waits for catchup, and asserts that the source's view of each +key changed. Between two invocations that happen to target the same +ring slot, a lot of wall time and fault injection may elapse — the +longer the run, the more genuinely "ancient" the snapshotted value is +at the time of the next overwrite. + +Assertion shape, per targeted key: + - `always`: post-catchup, the source's view of the key is NOT the + old value we snapshotted. The observed value can be (a) our new + value, (b) some other invocation's cross-overwrite, or (c) absent + (only possible if some concurrent peer tombstoned the key — this + driver never tombstones). 
The one outcome we must never see is + "row still present with the exact old value we just overwrote," which + means the write was silently dropped while no peer interfered. + - `sometimes`: the observed value equals OUR specific new value. + Liveness — confirms we sometimes win the race against any concurrent + peers and fully exercise the write+read pipeline through to query. + +Initial state for each key in the ring: until the first invocation +writes to ring slot `k`, no row exists for it. We tolerate that as +old_value=None and only assert post-catchup that the row now exists. +The first invocation to write each key seeds the property for later +ones. +""" + +from __future__ import annotations + +import logging +import sys + +import helper_random +from antithesis.assertions import always, sometimes +from helper_kafka import make_producer +from helper_pg import query_retry +from helper_quiet import request_quiet_period +from helper_source_stats import wait_for_catchup +from helper_upsert_source import ( + SOURCE_UPSERT_TEXT, + TOPIC_UPSERT_TEXT, + ensure_upsert_text_source, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.upsert_ancient_key_writable") + +# Fixed key ring owned exclusively by this driver. No other driver writes +# keys matching this prefix, so the property's assertions are race-free +# against the rest of the workload. The size sets the upper bound on how +# many distinct "ancient" rows accumulate — small enough that revisits +# happen often, large enough that two concurrent invocations of this +# driver usually don't pick the same ring slot. +ANCIENT_KEY_PREFIX = "ancient-k" +ANCIENT_KEY_RING_SIZE = 32 + +# Number of ancient keys to target per invocation. Small on purpose — the +# Test Composer launches this driver many times, so coverage comes from +# many short invocations rather than one big one. 
+ANCIENT_KEYS_PER_INVOCATION = 5 + +QUIET_PERIOD_S = 20 +CATCHUP_TIMEOUT_S = 60.0 + + +def _produce(producer, tracker, topic: str, key: str, value: str) -> None: + producer.produce( + topic=topic, + key=key.encode("utf-8"), + value=value.encode("utf-8"), + on_delivery=tracker.callback, + ) + + +def _snapshot_current_value(key: str) -> tuple[bool, str | None]: + """Return (found, value) for `key` at a real-time-recency timestamp. + UPSERT contract: at most one row per key. + """ + rows = query_retry( + f"SELECT count(*)::bigint, max(text) FROM {SOURCE_UPSERT_TEXT} WHERE key = %s", + (key,), + real_time_recency=True, + ) + if not rows: + return False, None + count, value = rows[0] + if count == 0: + return False, None + if count != 1: + raise RuntimeError( + f"upsert source has {count} rows for key {key!r}; this driver assumes " + "the per-key uniqueness property holds" + ) + return True, value + + +def main() -> int: + ensure_upsert_text_source() + + # Per-invocation prefix is only used as a nonce-namespace for our + # written values, so triage can attribute a `cross-<prefix>-<nonce>` + # value back to a specific invocation. The KEYS we target come from + # the shared ring, not from any per-invocation prefix. + prefix = f"p{helper_random.random_u64():016x}" + LOG.info("driver starting; prefix=%s", prefix) + + # Pick K distinct ring slots at random. The helper module doesn't + # expose `random_sample`, so we sample without replacement via + # repeated random_choice with removal. + candidate_pool = list(range(ANCIENT_KEY_RING_SIZE)) + slot_indices: list[int] = [] + for _ in range(min(ANCIENT_KEYS_PER_INVOCATION, len(candidate_pool))): + pick = helper_random.random_choice(candidate_pool) + candidate_pool.remove(pick) + slot_indices.append(pick) + keys = [f"{ANCIENT_KEY_PREFIX}{i}" for i in slot_indices] + + # Snapshot each key's current value BEFORE producing. This is the + # `old_value` half of the assertion. 
+ snapshots: list[tuple[str, bool, str | None]] = [] + for key in keys: + found, value = _snapshot_current_value(key) + snapshots.append((key, found, value)) + + sometimes( + any(found for _, found, _ in snapshots), + "upsert: at least one ancient-ring key has a prior value to overwrite", + {"keys": [k for k, _, _ in snapshots]}, + ) + + producer, tracker = make_producer(client_id=f"antithesis-ancient-{prefix}") + + # Produce a fresh value to each ring slot. The value embeds our + # prefix + a per-write nonce so triage can distinguish "our write + # reached the source" from "some concurrent invocation wrote". + new_values: list[tuple[str, bool, str | None, str]] = [] + for key, found, old_value in snapshots: + nonce = helper_random.random_u64() + new_value = f"cross-{prefix}-{nonce:016x}" + new_values.append((key, found, old_value, new_value)) + _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, new_value) + producer.poll(0) + + pending = producer.flush(timeout=30) + if pending > 0 or tracker.last_error is not None: + # Under sustained fault injection we can't prove which produces + # Kafka accepted. Bail before asserting — "writes that landed got + # reflected" doesn't apply to writes that didn't land. 
+ LOG.info( + "skipping assertions: producer.flush pending=%d last_error=%s", + pending, + tracker.last_error, + ) + return 0 + + max_produced = tracker.topic_max_offset(TOPIC_UPSERT_TEXT) + if max_produced < 0: + LOG.info("no produces confirmed; exiting cleanly") + return 0 + + request_quiet_period(QUIET_PERIOD_S) + caught_up = wait_for_catchup( + SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S + ) + sometimes( + caught_up, + "upsert: source caught up after cross-invocation produces", + {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced}, + ) + if not caught_up: + LOG.info("catchup did not complete in budget; skipping assertions") + return 0 + + my_value_observed = 0 + + for key, was_found, old_value, new_value in new_values: + found_after, observed = _snapshot_current_value(key) + + if was_found: + # Safety property: writing into a key that had a prior value + # must change the source's view. Accepted outcomes: + # * observed == new_value (we won the race) + # * observed == (peer won) + # * not found_after (peer tombstoned — this driver never + # does so it'd have to be a future variant or an + # external producer, but the shape is legitimate) + # The one outcome we must NEVER see is `found_after and + # observed == old_value`, which means our write was silently + # lost while no one else touched the key. + violation = found_after and observed == old_value + always( + not violation, + "upsert: write to ancient key changes its reflected value", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "old_value": old_value, + "new_value_attempted": new_value, + "observed_present": found_after, + "observed_value": observed, + }, + ) + else: + # First-touch path: the ring slot was empty before. After + # producing a non-null value, the source must contain a row. + # The row's value is either ours or a peer's cross-overwrite; + # both are valid. 
The one outcome we must never see is + # `not found_after` — meaning a non-tombstone write to an + # empty key produced no row. + always( + found_after, + "upsert: write to previously-empty ancient key creates a row", + { + "source": SOURCE_UPSERT_TEXT, + "key": key, + "new_value_attempted": new_value, + "observed_present": found_after, + "observed_value": observed, + }, + ) + + if found_after and observed == new_value: + my_value_observed += 1 + + sometimes( + my_value_observed > 0, + "upsert: cross-invocation driver's own write reached the source", + { + "my_value_observed": my_value_observed, + "ancient_keys_targeted": len(new_values), + }, + ) + + LOG.info( + "driver done; ancient_keys=%d my_value_observed=%d", + len(new_values), + my_value_observed, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 891668be02b75834c6c404f8357e44d9f09de284 Mon Sep 17 00:00:00 2001 From: Patrick Butler Date: Thu, 14 May 2026 11:25:29 -0400 Subject: [PATCH 50/65] add assertion for gtid monotonicity violation in mysql --- .../source/mysql/replication/partitions.rs | 10 ++ ...ysql-source-gtid-monotonicity-violation.md | 117 +++++++++++++++ .../scratchbook/property-catalog.md | 15 +- .../anytime_mysql_source_no_gtid_errors.py | 142 ++++++++++++++++++ 4 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md create mode 100644 test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py diff --git a/src/storage/src/source/mysql/replication/partitions.rs b/src/storage/src/source/mysql/replication/partitions.rs index c4a6a9ba743bc..7aef48cb3c2c8 100644 --- a/src/storage/src/source/mysql/replication/partitions.rs +++ b/src/storage/src/source/mysql/replication/partitions.rs @@ -11,6 +11,8 @@ use std::collections::BTreeMap; +use antithesis_sdk::assert_unreachable; +use serde_json::json; use timely::progress::Antichain; use uuid::Uuid; @@ -92,6 +94,14 @@ impl 
GtidReplicationPartitions { // should only see GTID transaction-ids // in a monotonic order for each source, starting at that upper. if active_part.timestamp() > new_part.timestamp() { + assert_unreachable!( + "mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica", + &json!({ + "source_uuid": source_id.to_string(), + "active_timestamp": format!("{:?}", active_part.timestamp()), + "new_timestamp": format!("{:?}", new_part.timestamp()), + }) + ); let err = DefiniteError::BinlogGtidMonotonicityViolation( source_id.to_string(), new_part.timestamp().clone(), diff --git a/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md b/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md new file mode 100644 index 0000000000000..01e90975c4ba7 --- /dev/null +++ b/test/antithesis/scratchbook/properties/mysql-source-gtid-monotonicity-violation.md @@ -0,0 +1,117 @@ +# mysql-source-gtid-monotonicity-violation — MySQL Source Must Not Enter Errored State Due to Out-of-Order GTIDs + +## Summary + +The Materialize MySQL CDC source must never receive GTIDs out of monotonic +order from the multithreaded replica. If it does, `BinlogGtidMonotonicityViolation` +(a `DefiniteError`) permanently errors the source — there is no self-recovery path. + +The pipeline is: + +``` +MySQL primary (GTID + WRITESET dependency tracking) + | + +--> MySQL replica (4 parallel workers, replica_preserve_commit_order=ON) + | + Materialize CDC source (mysql_cdc_source, antithesis_cluster) + | + antithesis_cdc table +``` + +With `replica_preserve_commit_order=ON` the replica guarantees it applies +transactions in primary-commit order even with 4 concurrent applier threads. +Under Antithesis fault injection — scheduling jitter, container kills at +arbitrary points, network delays — this guarantee is stress-tested. 
+ +## The Error + +`DefiniteError::BinlogGtidMonotonicityViolation` is raised in +`src/storage/src/source/mysql/replication/partitions.rs:advance_frontier` +when the per-UUID GTID `active_part.timestamp() > new_part.timestamp()`: +a new GTID has a lower transaction-id than one the source already processed. + +Error message: `"received out of order gtids for source {uuid} at transaction-id {txn}"` + +Once emitted, this `DefiniteError` flows to `DataflowError::SourceError` and +the source is permanently in the "errored" state. The only recovery is a +user-initiated `DROP SOURCE` + recreate. + +## Instrumentation + +**SUT-side** — `src/storage/src/source/mysql/replication/partitions.rs`. + +`assert_unreachable!("mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica", …)` fires immediately before the `DefiniteError` is returned. This gives Antithesis a precise, reproducible anchor at the exact site where the violation is detected — before the error propagates and the source enters the errored state. + +**Workload-side** — `test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py`. + +`anytime_` driver polls `mz_internal.mz_source_statuses` every 2 s for the +`mysql_cdc_source`. When `status = 'errored'` AND `error` contains +`"out of order gtids"`, fires: + +```python +always( + not is_gtid_error, # True normally; False triggers the property failure + "mysql: source must not enter errored state due to out-of-order GTIDs", + {"source": SOURCE_NAME, "status": status, "error": error, …}, +) +``` + +The workload-side check is complementary: it observes the effect at the +user-visible surface, while the SUT-side assertion fires at the exact causal +site inside the source operator. + +## Why This Property Matters + +With `replica_preserve_commit_order=ON` enabled, out-of-order GTIDs should +be impossible. 
This property tests whether Antithesis can find a schedule +(crash timing, worker scheduling delay, partial replica state) under which +the commit-order guarantee breaks down. A violation surfaces as: + +1. The SUT-side `assert_unreachable!` firing (gives Antithesis a replay anchor). +2. The source permanently stuck in "errored" state. +3. The `mysql-source-no-data-loss` `always()` assertions becoming vacuous + (catchup never completes, liveness anchor never fires). + +## Assertion Types Chosen + +- `Unreachable` (SUT-side): the violation path in `advance_frontier` should + never be reached. `assert_unreachable!` converts the error site into a + reportable Antithesis property. + +- `always(not is_gtid_error)` (workload-side): the observable effect (source + in "errored" state due to this error) must never be true. `always()` is + correct because this is a hard safety invariant — every observation must hold. + +## Related Properties + +- `mysql-source-no-data-loss` — shares the MySQL CDC pipeline; a GTID + ordering violation will also cause the data-loss property assertions to + become vacuous (catchup never completes once the source is errored). +- `storage-command-replay-idempotent` — MySQL CDC resume on clusterd restart + also exercises GTID position tracking; a corrupted GTID state after restart + could trigger this violation. 
+ +## Schema + +```sql +-- MySQL: commit-order-preserving multithreaded replication +SET GLOBAL replica_parallel_workers = 4; +SET GLOBAL replica_preserve_commit_order = ON; + +-- Materialize CDC source (reads from mysql-replica) +CREATE SOURCE mysql_cdc_source IN CLUSTER antithesis_cluster + FROM MYSQL CONNECTION antithesis_mysql_conn; +CREATE TABLE antithesis_cdc + FROM SOURCE mysql_cdc_source (REFERENCE antithesis.cdc_test); +``` + +## SUT Code Path + +``` +mysql/replication/partitions.rs :: GtidReplicationPartitions::advance_frontier + -> active_part.timestamp() > new_part.timestamp() + -> assert_unreachable!("mysql: BinlogGtidMonotonicityViolation …") ← NEW + -> DefiniteError::BinlogGtidMonotonicityViolation(source_id, txn_id) + -> ReplicationError::Definite(…) + -> source enters "errored" state permanently +``` diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index d4074c3bf7e2e..ec139c3ea4ae9 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,8 +1,9 @@ --- commit: 007c7af9d9970fb2030c7212368b232e0fbc363e -updated: 2026-05-12 +updated: 2026-05-14 --- + # Property Catalog: Materialize @@ -442,6 +443,18 @@ commit-order preservation) to the Antithesis environment. | **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. | | **Why It Matters** | MySQL CDC is a distinct ingestion code path from Kafka. Wrong behavior here — dropped rows, wrong values after restart, duplicate rows after resume — is not caught by the Kafka-source drivers. 
| +### mysql-source-gtid-monotonicity-violation — MySQL Source Must Not Error Due to Out-of-Order GTIDs + +| | | +|---|---| +| **Type** | Safety (Unreachable) | +| **Priority** | P1 — permanent source error with no self-recovery path; directly testable by Antithesis fault injection against the multithreaded replica's commit-order protocol | +| **Status** | **Implemented (SUT-side + workload-side)** — `src/storage/src/source/mysql/replication/partitions.rs`: `assert_unreachable!("mysql: BinlogGtidMonotonicityViolation — received out-of-order GTID from multithreaded replica", …)` fires immediately before `DefiniteError::BinlogGtidMonotonicityViolation` is returned in `advance_frontier`, giving Antithesis a precise replay anchor at the exact causal site. Workload-side: `test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py` polls `mz_internal.mz_source_statuses` every 2 s and fires `always(not is_gtid_error, "mysql: source must not enter errored state due to out-of-order GTIDs", …)` at the user-visible error surface. | +| **Property** | The Materialize MySQL CDC source must never receive a GTID with a lower transaction-id than one already observed for the same UUID. With `replica_preserve_commit_order=ON` and 4 parallel replica workers, the commit-order protocol must hold even under Antithesis fault injection. | +| **Invariant** | `Unreachable`: the `BinlogGtidMonotonicityViolation` error site in `advance_frontier` must never be reached. `Always`: `mz_internal.mz_source_statuses` for the MySQL CDC source must never show `status = 'errored'` with `error` containing `"out of order gtids"`. | +| **Antithesis Angle** | Scheduling jitter under 4 parallel replica workers; container kills of the replica at arbitrary replication progress points; network delays between primary and replica that could desynchronize the commit-order queue. The property tests whether `replica_preserve_commit_order=ON` holds its guarantee when Antithesis controls the scheduler. 
| +| **Why It Matters** | `BinlogGtidMonotonicityViolation` is a `DefiniteError` — the source is permanently stuck with no self-recovery path. It also silently neutralizes the `mysql-source-no-data-loss` liveness assertions (catchup never completes once the source is errored). Surfaced by: MySQL CDC source configuration, multithreaded replication correctness. | + ### offset-known-not-below-committed — Source Statistics Causality | | | diff --git a/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py new file mode 100644 index 0000000000000..264ad67584f1a --- /dev/null +++ b/test/antithesis/workload/test/anytime_mysql_source_no_gtid_errors.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for `mysql-source-gtid-monotonicity-violation`. + +The MySQL CDC source must never enter an errored state because of +"received out of order gtids". This error (BinlogGtidMonotonicityViolation) +fires when the multithreaded replica delivers a GTID with a lower +transaction-id than what was already observed for that UUID — permanently +erroring the source with no self-recovery path. + +With 4 parallel replica workers and `replica_preserve_commit_order=ON`, +this should be impossible: the commit-order protocol guarantees that GTIDs +arrive in primary-commit order. But under Antithesis fault injection +(scheduling jitter, container kills, network delays) the commit-order +guarantee could be tested. 
This driver records an `always()` failure the +moment the errored state is observed, giving Antithesis a reportable +property violation with a deterministic replay anchor. + +This is an `anytime_` driver — it runs continuously throughout the timeline +so faults active during its polling window can be correlated with the first +observed error. A bounded run budget (`RUN_BUDGET_S`) prevents one instance +from pinning resources; Antithesis re-launches it freely. + +Error-state detection is workload-observable: `mz_internal.mz_source_statuses` +reports `status = 'errored'` with `error` containing the error message. We +check for the specific substring "out of order gtids" so the assertion is +tight and won't fire on unrelated source errors. + +The complementary SUT-side assertion lives in +`src/storage/src/source/mysql/replication/partitions.rs`: +`assert_unreachable!("mysql: BinlogGtidMonotonicityViolation …")`. +""" + +from __future__ import annotations + +import logging +import sys +import time + +from antithesis.assertions import always +from helper_mysql_source import SOURCE_NAME +from helper_pg import query_retry + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.mysql_source_no_gtid_errors") + +# Knobs. +POLL_INTERVAL_S = 2.0 +RUN_BUDGET_S = 60.0 + +# Substring that identifies the specific error this property targets. +_GTID_ORDER_ERROR = "out of order gtids" + + +def _source_status() -> tuple[str, str | None] | None: + """Query status and error for the MySQL CDC source. + + Returns (status, error_message) or None if the source doesn't exist yet + or the query fails (both are expected early in a timeline). 
+ """ + try: + rows = query_retry( + """ + SELECT ss.status, ss.error + FROM mz_internal.mz_source_statuses ss + JOIN mz_sources s ON s.id = ss.id + WHERE s.name = %s + """, + (SOURCE_NAME,), + ) + except Exception as exc: # noqa: BLE001 + LOG.info("source status query failed: %s", exc) + return None + if not rows: + return None + status, error = rows[0] + return (status, error) + + +def main() -> int: + deadline = time.monotonic() + RUN_BUDGET_S + checks = 0 + + while time.monotonic() < deadline: + result = _source_status() + if result is None: + time.sleep(POLL_INTERVAL_S) + continue + + status, error = result + checks += 1 + + is_gtid_error = ( + status == "errored" + and error is not None + and _GTID_ORDER_ERROR in error.lower() + ) + + always( + not is_gtid_error, + "mysql: source must not enter errored state due to out-of-order GTIDs", + { + "source": SOURCE_NAME, + "status": status, + "error": error, + "note": ( + "BinlogGtidMonotonicityViolation fired — multithreaded replica " + "delivered a GTID with lower txn-id than previously observed; " + "replica_preserve_commit_order protocol violated under fault injection" + ), + }, + ) + + if is_gtid_error: + LOG.error( + "gtid monotonicity violation detected: status=%s error=%s", + status, + error, + ) + + time.sleep(POLL_INTERVAL_S) + + LOG.info( + "mysql-source-no-gtid-errors done; %d status checks over %.0fs", + checks, + RUN_BUDGET_S, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 86d1fbbc058b019dd170263a7e202427250e7023 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 11:47:00 -0400 Subject: [PATCH 51/65] test/antithesis: bump clusterd workers to 16 and shrink pool to 2 Workers per clusterd: 4 -> 16. Single-process clusterds at workers=16 exercise the same intra-process concurrency surface as a 4-process scale=4,workers=4 production deployment, giving us realistic per-shard parallelism, scheduler contention, and Antithesis-thread-pause-fault depth. Pool size: 8 -> 2. 
The no-lock allocator already tolerates oversubscription (concurrent invocations may share a pool cluster because every workload object lives in a seed-scoped database), so a smaller pool isn't a correctness concern. A pool of 2 keeps the topology closer to production replica counts and makes each pool cluster behave more like a busy production cluster. Worker count is now plumbed through end-to-end: * mzcompose.py declares CLUSTERD_WORKERS=16 and uses it for every Clusterd(...) service AND exports it in the Workload service env. * workload-entrypoint.sh reads CLUSTERD_WORKERS and templates it into every CREATE CLUSTER REPLICAS' WORKERS clause (antithesis_cluster plus each pool_cluster_{i}). The controller reads WORKERS from this clause, not from clusterd's runtime config, so the two must stay in lockstep. Total worker thread count goes from 4*8 + 4*2 = 40 (old: 8 pool + 2 antithesis) to 16*2 + 16*2 = 64 (new: 2 pool + 2 antithesis). Modest memory increase, big throughput / parallelism gain. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/antithesis/config/docker-compose.yaml | 249 +----------------- test/antithesis/mzcompose.py | 79 ++++-- .../parallel-workload-fault-isolation.md | 30 ++- .../test/parallel_driver_parallel_workload.py | 2 +- .../workload/workload-entrypoint.sh | 27 +- 5 files changed, 99 insertions(+), 288 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index e1439653361bb..8b162e1224a78 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -258,10 +258,10 @@ services: - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd1:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd1:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -296,10 +296,10 @@ services: - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 
'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd2:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd2:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -334,10 +334,10 @@ services: - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-0:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-0:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -372,238 +372,10 @@ services: - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2102"], + - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2102"], "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-1:2103"], - "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - 
mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-2: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-2 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-2:2103"], - "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-3: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-3 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - 
CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-3:2103"], - "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-4: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-4 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-4:2103"], - "arrangement_exert_proportionality": 1337, 
"enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-5: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-5 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-5:2103"], - "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-6: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-6 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - 
CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-6:2103"], - "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - volumes: - - mzdata:/mzdata - - mydata:/var/lib/mysql-files - - tmp:/share/tmp - - scratch:/scratch - restart: 'no' - stop_grace_period: 120s - platform: linux/amd64 - image: ${MATERIALIZED_IMAGE} - clusterd-pool-7: - entrypoint: - - tini - - -- - command: - - clusterd - ports: - - 2100 - - 2101 - - 6878 - environment: - - CLUSTERD_GRPC_HOST=clusterd-pool-7 - - CLUSTERD_USE_CTP=true - - MZ_SOFT_ASSERTIONS=1 - - CLUSTERD_STORAGE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2100 - - CLUSTERD_COMPUTE_CONTROLLER_LISTEN_ADDR=0.0.0.0:2101 - - CLUSTERD_INTERNAL_HTTP_LISTEN_ADDR=0.0.0.0:6878 - - CLUSTERD_SECRETS_READER=local-file - - CLUSTERD_SECRETS_READER_LOCAL_FILE_DIR=/mzdata/secrets - - LD_PRELOAD=libeatmydata.so - - CLUSTERD_PERSIST_PUBSUB_URL=http://materialized:6879 - - CLUSTERD_ENVIRONMENT_ID=mzcompose-us-east-1-00000000-0000-0000-0000-000000000000-0 - - CLUSTERD_PROCESS=0 - - 'CLUSTERD_COMPUTE_TIMELY_CONFIG={"workers": 4, "process": 0, "addresses": ["clusterd-pool-7:2102"], - "arrangement_exert_proportionality": 16, "enable_zero_copy": false, "enable_zero_copy_lgalloc": - false, "zero_copy_limit": null}' - - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 4, "process": 0, 
"addresses": ["clusterd-pool-7:2103"], + - 'CLUSTERD_STORAGE_TIMELY_CONFIG={"workers": 16, "process": 0, "addresses": ["clusterd-pool-1:2103"], "arrangement_exert_proportionality": 1337, "enable_zero_copy": false, "enable_zero_copy_lgalloc": false, "zero_copy_limit": null}' volumes: @@ -828,8 +600,9 @@ services: - KAFKA_BROKER=kafka:9092 - SCHEMA_REGISTRY_URL=http://schema-registry:8081 - MZ_ANTITHESIS_CLUSTER=antithesis_cluster - - ANTITHESIS_CLUSTERD_POOL_SIZE=8 - - CLUSTERD_POOL_SIZE=8 + - ANTITHESIS_CLUSTERD_POOL_SIZE=2 + - CLUSTERD_POOL_SIZE=2 + - CLUSTERD_WORKERS=16 - MYSQL_HOST=mysql - MYSQL_REPLICA_HOST=mysql-replica - MYSQL_PASSWORD=p@ssw0rd diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index 2a75dce53b4e0..bfbc1abd6d4ca 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -54,11 +54,25 @@ # Number of pool clusterd containers reserved for parallel-workload clusters # (one container per cluster, giving each its own container-level fault # domain). Read from the env so CI/local runs can tune it without editing -# this file. Default 8 — enough for ~8 concurrent parallel-driver -# invocations under the v1 "one cluster per invocation, replication -# factor 1" allocation, see test/antithesis/workload/test/ -# parallel_driver_parallel_workload.py. -CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "8")) +# this file. Default 2 — the no-lock allocator (rng-picked slot per +# invocation) tolerates oversubscription, and a smaller pool keeps the +# topology closer to production replica counts. +CLUSTERD_POOL_SIZE = int(os.environ.get("ANTITHESIS_CLUSTERD_POOL_SIZE", "2")) + +# Timely worker threads per clusterd process. 
Bumped to 16 to match the +# per-process worker density of larger production cluster sizes — single- +# process clusterds at workers=16 cover the same intra-process +# concurrency surface as a 4-process scale=4,workers=4 production +# deployment, so we exercise per-shard parallelism, scheduler contention, +# and the Antithesis thread-pause fault target with realistic depth. +# +# This value must stay in lockstep with the `WORKERS N` clause in every +# CREATE CLUSTER REPLICAS statement that targets these containers +# (workload-entrypoint.sh reads it from the CLUSTERD_WORKERS env var +# the Workload service passes through; the parallel-workload Python +# driver consumes the same env via the framework's pool-cluster +# wrapper). +CLUSTERD_WORKERS = 16 class Workload(Service): @@ -94,6 +108,13 @@ def __init__(self) -> None: # slot count. f"ANTITHESIS_CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}", f"CLUSTERD_POOL_SIZE={CLUSTERD_POOL_SIZE}", + # Worker count for the WORKERS clause in every CREATE + # CLUSTER REPLICAS that targets a clusterd-pool or + # clusterd1/2 container. Must match the `workers=` + # argument passed to each `Clusterd(...)` Service above, + # because the controller reads it from this clause not + # from clusterd's runtime config. + f"CLUSTERD_WORKERS={CLUSTERD_WORKERS}", # MySQL primary and replica connection details. "MYSQL_HOST=mysql", "MYSQL_REPLICA_HOST=mysql-replica", @@ -145,14 +166,16 @@ def __init__(self) -> None: # Antithesis kill either replica's backing container without taking # the workload offline. # - # `workers=4` per clusterd means each replica runs four timely worker - # threads in one process. The extra intra-process parallelism is the - # surface area Antithesis's thread-pausing fault targets — with a - # single worker, "pause one thread" effectively pauses the whole - # process, which the container-pause fault already covers. 
The matching - # `WORKERS 4` in the CREATE CLUSTER REPLICAS statement must stay in - # lockstep with this value (it's read by the controller, not by - # clusterd). + # `workers=CLUSTERD_WORKERS` (16) per clusterd means each replica runs + # that many timely worker threads in one process. Sized to cover the + # per-process worker density of larger production cluster sizes: + # single-process clusterds at workers=16 exercise the same + # intra-process concurrency surface as a 4-process scale=4,workers=4 + # production deployment (per-shard parallelism, scheduler contention, + # Antithesis thread-pause fault targets). The matching `WORKERS N` + # clause in every CREATE CLUSTER REPLICAS statement must equal this + # — workload-entrypoint.sh reads CLUSTERD_WORKERS from the env the + # Workload service exports. # # `scratch_directory=None` matches production: cluster replicas in # cloud deployments don't get a scratch disk, so the upsert operator's @@ -165,31 +188,33 @@ def __init__(self) -> None: # loops on clusterd1 in an earlier run). Clusterd( name="clusterd1", - workers=4, + workers=CLUSTERD_WORKERS, scratch_directory=None, ), Clusterd( name="clusterd2", - workers=4, + workers=CLUSTERD_WORKERS, scratch_directory=None, ), # Pool of identical clusterd containers reserved for the - # parallel-workload driver. Each instance is a possible target for - # one parallel-workload cluster, giving that cluster its own - # container-level fault domain (Antithesis can kill / pause / - # partition / throttle a specific pool member without affecting any - # other cluster). Same settings as clusterd1/clusterd2: 4 timely - # workers per process, no scratch (matches production), restart=no - # so Antithesis fault injection isn't fought by docker-compose. + # parallel-workload driver. 
Each instance backs one long-lived + # `pool_cluster_` (bootstrapped by workload-entrypoint.sh), giving + # that cluster its own container-level fault domain (Antithesis can + # kill / pause / partition / throttle a specific pool member without + # affecting any other cluster). Same settings as clusterd1/clusterd2: + # workers=CLUSTERD_WORKERS, no scratch (matches production), + # restart=no so Antithesis fault injection isn't fought by docker- + # compose. # - # Sizing rationale lives in test/antithesis/workload/test/ - # parallel_driver_parallel_workload.py — the driver maps invocation - # seed → pool slot deterministically and assumes the pool is at - # least as big as the expected concurrent-invocation count. + # Pool sizing rationale lives in test/antithesis/workload/test/ + # parallel_driver_parallel_workload.py — the driver picks a slot at + # random per invocation; with the no-lock allocator, multiple + # invocations may share a pool cluster (which is fine because every + # workload object lives in a seed-scoped database). *[ Clusterd( name=f"clusterd-pool-{i}", - workers=4, + workers=CLUSTERD_WORKERS, scratch_directory=None, ) for i in range(CLUSTERD_POOL_SIZE) diff --git a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md index 884a030b704db..3aa97e55078d5 100644 --- a/test/antithesis/scratchbook/parallel-workload-fault-isolation.md +++ b/test/antithesis/scratchbook/parallel-workload-fault-isolation.md @@ -43,12 +43,15 @@ never dropped. Components, bottom-up: - - **`Clusterd(name="clusterd-pool-{i}", workers=4, scratch_directory=None)`** - in `test/antithesis/mzcompose.py`. Same configuration as - `clusterd1`/`clusterd2`: four timely workers per process (so - Antithesis thread-pause faults have something distinct to pause), - mem_env RocksDB (matches production, no scratch volume to fight over). - Pool size from env (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 8). 
+ - **`Clusterd(name="clusterd-pool-{i}", workers=CLUSTERD_WORKERS, + scratch_directory=None)`** in `test/antithesis/mzcompose.py`. Same + configuration as `clusterd1`/`clusterd2`: 16 timely workers per + process (matches the per-process worker density of larger + production cluster sizes — single-process clusterds at workers=16 + cover the same intra-process concurrency surface as a 4-process + scale=4,workers=4 production deployment), mem_env RocksDB (matches + production, no scratch volume to fight over). Pool size from env + (`ANTITHESIS_CLUSTERD_POOL_SIZE`, default 2). - **Pool-cluster bootstrap** in `test/antithesis/workload/workload-entrypoint.sh`. After materialized @@ -132,10 +135,12 @@ cluster identity reconnecting. That's the path reconcile is designed for. problem. - **Pool size much smaller than concurrency.** With C concurrent - invocations and N pool slots, ~C/N invocations share each cluster - in steady state. That's correctness-preserving but increases - per-cluster state pressure linearly with the ratio. Bump - `ANTITHESIS_CLUSTERD_POOL_SIZE` if a single pool cluster runs hot. + invocations and N pool slots (default N=2), ~C/N invocations share + each cluster in steady state. That's correctness-preserving but + increases per-cluster state pressure linearly with the ratio. The + pool is deliberately small so each pool cluster behaves more like + a busy production cluster; bump `ANTITHESIS_CLUSTERD_POOL_SIZE` if + a single pool cluster runs hot enough to mask other signals. ## v1 limitations (future work) @@ -166,7 +171,8 @@ cluster identity reconnecting. That's the path reconcile is designed for. | Variable | Default | Effect | |---|---|---| -| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 8 | Number of clusterd-pool- containers deployed and matching pool_cluster_ clusters bootstrapped. | -| `CLUSTERD_POOL_SIZE` (driver) | 8 | Number of slots the driver chooses among. 
Mirrored from compose by mzcompose.py's Workload service so the two agree. | +| `ANTITHESIS_CLUSTERD_POOL_SIZE` (compose + entrypoint) | 2 | Number of clusterd-pool- containers deployed and matching pool_cluster_ clusters bootstrapped. | +| `CLUSTERD_POOL_SIZE` (driver) | 2 | Number of slots the driver chooses among. Mirrored from compose by mzcompose.py's Workload service so the two agree. | +| `CLUSTERD_WORKERS` (compose + entrypoint) | 16 | Timely worker threads per clusterd process. Must match every CREATE CLUSTER REPLICAS' WORKERS clause and every `Clusterd(workers=...)` Service. | | `PW_RUNTIME_S` (driver) | 20 | Per-invocation runtime; bound to keep the fault-injection budget granular. | | `PW_THREADS` (driver) | 4 | Worker threads inside one invocation. | diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 427a0babc0f16..4f5302c714544 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -121,7 +121,7 @@ # faults one container at a time, so the per-container fault domain is # preserved; multiple invocations witnessing the same fault is a # feature (more independent reproductions per failure). -CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "8")) +CLUSTERD_POOL_SIZE = int(os.environ.get("CLUSTERD_POOL_SIZE", "2")) def _alter_system(cur: psycopg.Cursor[Any], stmt: str) -> None: diff --git a/test/antithesis/workload/workload-entrypoint.sh b/test/antithesis/workload/workload-entrypoint.sh index bfef3f12e4c1a..ba2b95a8c8b2c 100755 --- a/test/antithesis/workload/workload-entrypoint.sh +++ b/test/antithesis/workload/workload-entrypoint.sh @@ -20,7 +20,13 @@ CLUSTER="${MZ_ANTITHESIS_CLUSTER:-antithesis_cluster}" # Number of long-lived pool clusters to bootstrap, each bound to its own # clusterd-pool-{i} container. 
Must match `ANTITHESIS_CLUSTERD_POOL_SIZE` # in mzcompose.py and `CLUSTERD_POOL_SIZE` in the parallel-workload driver. -CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-8}" +CLUSTERD_POOL_SIZE="${ANTITHESIS_CLUSTERD_POOL_SIZE:-2}" +# Timely worker threads per clusterd process — must equal the `workers=` +# argument every `Clusterd(...)` Service in mzcompose.py passes, because +# the controller reads worker count from the WORKERS clause we put in +# CREATE CLUSTER REPLICAS, not from clusterd's runtime config. Plumbed +# in via the Workload service's environment. +CLUSTERD_WORKERS="${CLUSTERD_WORKERS:-16}" # Wait for materialized to be ready. echo "Waiting for materialized to become healthy..." @@ -52,14 +58,14 @@ CREATE CLUSTER ${CLUSTER} REPLICAS ( STORAGE ADDRESSES ['clusterd1:2103'], COMPUTECTL ADDRESSES ['clusterd1:2101'], COMPUTE ADDRESSES ['clusterd1:2102'], - WORKERS 4 + WORKERS ${CLUSTERD_WORKERS} ), replica2 ( STORAGECTL ADDRESSES ['clusterd2:2100'], STORAGE ADDRESSES ['clusterd2:2103'], COMPUTECTL ADDRESSES ['clusterd2:2101'], COMPUTE ADDRESSES ['clusterd2:2102'], - WORKERS 4 + WORKERS ${CLUSTERD_WORKERS} ) ); GRANT ALL ON CLUSTER ${CLUSTER} TO ${PGUSER}; @@ -70,12 +76,13 @@ fi # Bootstrap a long-lived `pool_cluster_{i}` for each clusterd-pool-{i} # container. Each pool cluster has exactly one replica wired to its -# matching pool clusterd. Parallel-workload driver invocations claim a -# slot (via fcntl.flock on the workload container's filesystem) and run -# against `pool_cluster_{slot}` for their entire lifetime. The cluster -# identity is tied to the clusterd identity, so reconnects don't trip -# clusterd's `instance configuration not compatible` halt; only the -# seed-scoped database / roles get dropped between invocations. +# matching pool clusterd. 
Parallel-workload driver invocations pick a +# slot at random and run against `pool_cluster_{slot}`; concurrent +# invocations may share a pool cluster (every workload object is in a +# seed-scoped database so they don't collide). The cluster identity is +# tied to the clusterd identity, so reconnects don't trip clusterd's +# `instance configuration not compatible` halt; only the seed-scoped +# database / roles get dropped between invocations. # # Idempotent: skip pool clusters that already exist (the SUT's catalog # survives across `docker compose up` if metadata volumes aren't wiped). @@ -97,7 +104,7 @@ CREATE CLUSTER ${POOL_CLUSTER} REPLICAS ( STORAGE ADDRESSES ['clusterd-pool-${i}:2103'], COMPUTECTL ADDRESSES ['clusterd-pool-${i}:2101'], COMPUTE ADDRESSES ['clusterd-pool-${i}:2102'], - WORKERS 4 + WORKERS ${CLUSTERD_WORKERS} ) ); GRANT ALL ON CLUSTER ${POOL_CLUSTER} TO ${PGUSER}; From d0aa7fbecc1d5805c8aab1b95f2d4b5707e418f4 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 12:10:48 -0400 Subject: [PATCH 52/65] test/antithesis: add MyISAM cdc table to mysql workload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers the non-transactional DML axis of the MySQL CDC source. Materialize's MySQL source code path is engine-agnostic (binlog ROW events look the same for MyISAM and InnoDB), so this exists to assert the engine-agnostic contract holds in practice when the upstream is MyISAM: * BEGIN/COMMIT around MyISAM statements is silently ignored — each statement commits immediately with its own GTID. * No rollback semantics: a statement killed mid-write leaves whatever rows it managed to insert committed. * Table-level locking instead of row-level. A MyISAM-specific regression would surface as the new driver's 'mysql myisam: CDC source row has correct value after catchup' firing false while the existing InnoDB-backed driver looks healthy. 
What landed: * first_mysql_replica_setup.py creates antithesis.cdc_test_myisam (ENGINE=MyISAM) alongside the existing cdc_test (ENGINE=InnoDB) on the primary, and waits for both to replicate to the replica. * helper_mysql_source.py exposes MYSQL_TABLE_MYISAM / TABLE_NAME_MYISAM constants and an ensure_mysql_cdc_myisam_table() helper. ensure_mysql_cdc_source() now creates both subsources off the single mysql_cdc_source SOURCE. * parallel_driver_mysql_myisam.py mirrors the InnoDB sibling's shape against the MyISAM subsource with the 'myi-p' batch prefix so the two drivers don't interfere. * Property doc scratchbook/properties/mysql-myisam-cdc-no-data-loss.md captures the bug class, MyISAM-specific binlog semantics, and the assertion list. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mysql-myisam-cdc-no-data-loss.md | 82 ++++++ .../test/first_mysql_replica_setup.py | 66 +++-- .../workload/test/helper_mysql_source.py | 43 ++- .../test/parallel_driver_mysql_myisam.py | 249 ++++++++++++++++++ 4 files changed, 414 insertions(+), 26 deletions(-) create mode 100644 test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md create mode 100644 test/antithesis/workload/test/parallel_driver_mysql_myisam.py diff --git a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md new file mode 100644 index 0000000000000..5e5b6fd239f0e --- /dev/null +++ b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md @@ -0,0 +1,82 @@ +# mysql-myisam-cdc-no-data-loss + +## Summary + +Every row inserted to a MyISAM table on the MySQL primary must eventually appear, with the correct value, in the Materialize CDC source that reads from the multithreaded replica — under the same guarantees as the InnoDB-backed `mysql-source-no-data-loss`. 
+ +This property exists separately from `mysql-source-no-data-loss` to cover the non-transactional DML axis: MyISAM in MySQL has fundamentally different transactional semantics, and Materialize's MySQL source code path doesn't distinguish engines, so we assert the engine-agnostic contract holds in practice. + +## Why MyISAM is interesting for CDC testing + +MyISAM differs from InnoDB in ways that show up in the binlog event stream: + +* **No multi-statement transactions.** BEGIN/COMMIT around MyISAM statements is silently ignored. Every MyISAM statement is its own implicit transaction and gets its own GTID — there is no bundling of multiple statements under one GTID block. +* **No rollback.** A statement that fails partway through (e.g., a multi-row INSERT killed by a fault between rows 50 and 51) leaves the partial result committed. Whatever rows made it to the engine are durable; nothing rolls back. +* **Table-level locking instead of row-level.** Concurrent writers serialize rather than abort-and-retry. The binlog sees a strict serial order. +* **No crash recovery via redo log.** A crash mid-statement on the primary can leave the on-disk MyISAM table inconsistent with the binlog, but for our purposes we run against a healthy primary and read CDC from the replica, so this only matters under specific Antithesis fault-injection patterns. + +Materialize's MySQL source decodes binlog events without consulting the upstream engine. ROW-format binlog events look identical for MyISAM and InnoDB. The source's expected contract is "every binlog event is reflected in the materialize table"; we assert that this contract holds when the upstream is MyISAM. + +## Code paths + +Same as `mysql-source-no-data-loss`: +- `src/storage/src/source/mysql/replication/partitions.rs` — binlog event decoding, GTID monotonicity check. +- `src/storage/src/source/mysql/snapshot.rs` — initial snapshot from the replica (uses `LOCK TABLES ... READ` for non-transactional engines). 
+- `src/mysql-util` — connection management. + +No engine-specific code in the source. That's the property we're verifying. + +## How to check it + +Workload procedure (per invocation): +1. Pick a per-invocation `batch_id` prefix (`myi-p`) so concurrent drivers — including the InnoDB sibling — don't collide. +2. Insert 20 rows into `antithesis.cdc_test_myisam` on the MySQL primary. Each INSERT is its own implicit transaction. +3. Record the {id → value} map locally. +4. Request an Antithesis quiet period. +5. Poll `COUNT(*) FROM antithesis_cdc_myisam WHERE batch_id = ?` until it reaches the inserted count or the budget expires. +6. For each row, `SELECT value FROM antithesis_cdc_myisam WHERE id = ?` with `real_time_recency=true`. Assert `value` matches the locally-recorded one. + +## What goes wrong on violation + +Same failure modes as `mysql-source-no-data-loss`: rows missing, rows with wrong values, extra rows. The bug is silent — the workload sees plausible-but-wrong data. + +A MyISAM-specific failure mode worth flagging in triage: if the Materialize source were to *accidentally* treat MyISAM events differently (e.g., conflate the lack-of-transaction with the lack-of-event), we'd see consistent under-counting on the MyISAM subsource while the InnoDB sibling looks healthy. + +## Antithesis angle + +The same fault classes that hit `mysql-source-no-data-loss` apply: +- Mysql primary container pause / restart between insert and binlog flush. +- Mysql-replica container pause / restart between binlog ingestion and materialize-side consumption. +- Materialized container pause / restart between CDC ingestion and persist append. +- Clusterd container (`clusterd1` / `clusterd2`) pause / restart on `antithesis_cluster`, the cluster running the MySQL source. + +Specifically MyISAM-relevant scenarios: +- A multi-row INSERT killed mid-statement should leave only the rows that actually committed. The replica's binlog reflects exactly those rows. Materialize must see exactly those rows.
(The driver inserts row-by-row in a Python loop so we don't directly exercise "kill a multi-row INSERT," but Antithesis can pause the primary between any two row-INSERTs in the loop, achieving the same shape.) +- GTID ordering with MyISAM is per-statement: a workload that interleaves MyISAM and InnoDB writes produces an interleaved GTID stream. Materialize must honor that ordering. (The InnoDB sibling driver and this driver run as independent parallel driver invocations, naturally producing interleaved binlog events.) + +## Dependencies + +- Requires `gtid_mode = ON` and `binlog_format = ROW` on the primary (already set by mzcompose). +- Requires the MyISAM table on the primary AND on the replica (provisioned by `first_mysql_replica_setup.py`). +- The Materialize MySQL source must include the MyISAM table as a referenced subsource (`ensure_mysql_cdc_myisam_table()` in `helper_mysql_source.py`). + +## Existing instrumentation + +None engine-specific. The general `mysql-source-gtid-monotonicity-violation` SUT assertion (introduced 2026-05-14) covers GTID ordering for both engines uniformly. + +## Implementation status + +Implemented as `test/antithesis/workload/test/parallel_driver_mysql_myisam.py`. + +| Message | Type | Fires when | +|---------|------|------------| +| `"mysql myisam: CDC source row has correct value after catchup"` | `always` | Per row, after catchup. False ⟺ row missing or value wrong. | +| `"mysql myisam: CDC source row count matches inserted count after catchup"` | `always` | Per invocation, after catchup. False ⟺ extra or missing rows for this batch. Triage note: the driver records only acknowledged INSERTs, and MyISAM cannot roll back, so an INSERT that committed on the primary but whose acknowledgement was lost may also surface here as an extra row. | +| `"mysql myisam: CDC source caught up to all primary inserts after quiet period"` | `sometimes` | Per invocation. Liveness for the catchup gate. | +| `"mysql replica: both cdc_test tables replicated from primary within 90s"` | `sometimes` | Per timeline (fires once from `first_mysql_replica_setup`). Confirms replication is flowing for both engines.
| + +Knobs: `ROWS_PER_INVOCATION=20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=90.0`. + +## Provenance + +Surfaced by: Data Integrity (engine-agnostic CDC contract). diff --git a/test/antithesis/workload/test/first_mysql_replica_setup.py b/test/antithesis/workload/test/first_mysql_replica_setup.py index ee603e60e88d6..f697334beb51f 100644 --- a/test/antithesis/workload/test/first_mysql_replica_setup.py +++ b/test/antithesis/workload/test/first_mysql_replica_setup.py @@ -40,8 +40,23 @@ def setup_primary() -> None: - """Create the antithesis schema and cdc_test table on the MySQL primary.""" - LOG.info("creating antithesis database and cdc_test table on primary") + """Create the antithesis schema and both cdc_test tables on the MySQL + primary. + + Two tables on different engines so we exercise both the transactional + (InnoDB) and non-transactional (MyISAM) DML paths through the binlog + and the Materialize MySQL source. MyISAM differences worth noting for + triage: + * BEGIN/COMMIT around MyISAM statements is silently ignored — each + statement commits immediately. + * Each MyISAM statement is its own GTID-tagged binlog event (no + bundling into a multi-statement transaction). + * No rollback semantics: a MyISAM statement that fails partway + through leaves whatever rows it managed to write committed. + * No ON UPDATE TIMESTAMP support before MySQL 5.6 — we use a + simpler schema (no updated_at) on MyISAM to avoid version churn. 
+ """ + LOG.info("creating antithesis database and cdc_test tables on primary") helper_mysql.execute_primary("CREATE DATABASE IF NOT EXISTS antithesis") helper_mysql.execute_primary( """ @@ -51,11 +66,21 @@ def setup_primary() -> None: value TEXT NOT NULL, updated_at TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP(6) - ) + ) ENGINE=InnoDB + """, + database="antithesis", + ) + helper_mysql.execute_primary( + """ + CREATE TABLE IF NOT EXISTS antithesis.cdc_test_myisam ( + id VARCHAR(64) NOT NULL PRIMARY KEY, + batch_id VARCHAR(64) NOT NULL, + value TEXT NOT NULL + ) ENGINE=MyISAM """, database="antithesis", ) - LOG.info("antithesis.cdc_test ready on primary") + LOG.info("antithesis.cdc_test (InnoDB) and cdc_test_myisam (MyISAM) ready on primary") def configure_replica() -> None: @@ -94,26 +119,33 @@ def configure_replica() -> None: LOG.info("MySQL replica started") -def wait_for_replica_table(timeout_s: float = 90.0) -> bool: - """Wait until antithesis.cdc_test is visible on the replica. +def wait_for_replica_tables(timeout_s: float = 90.0) -> bool: + """Wait until both antithesis.cdc_test (InnoDB) and cdc_test_myisam + (MyISAM) are visible on the replica. - Returns True when the table appears (replication is flowing), False on - timeout. + Returns True when both tables appear (replication is flowing across + both engines), False on timeout. 
""" deadline = time.monotonic() + timeout_s + needed = {"cdc_test", "cdc_test_myisam"} while time.monotonic() < deadline: try: rows = helper_mysql.query_replica( - "SELECT 1 FROM information_schema.tables " - "WHERE table_schema = 'antithesis' AND table_name = 'cdc_test'", + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'antithesis' " + "AND table_name IN ('cdc_test', 'cdc_test_myisam')", ) - if rows: - LOG.info("antithesis.cdc_test visible on replica — replication flowing") + seen = {r[0] for r in rows} + if needed.issubset(seen): + LOG.info( + "antithesis cdc tables visible on replica — replication flowing (%s)", + sorted(seen), + ) return True except Exception as exc: # noqa: BLE001 - LOG.info("waiting for replica table: %s", exc) + LOG.info("waiting for replica tables: %s", exc) time.sleep(2) - LOG.warning("timed out waiting for antithesis.cdc_test on replica") + LOG.warning("timed out waiting for antithesis.cdc_test{,_myisam} on replica") return False @@ -127,10 +159,10 @@ def main() -> int: setup_primary() configure_replica() - replica_ready = wait_for_replica_table() + replica_ready = wait_for_replica_tables() sometimes( replica_ready, - "mysql replica: antithesis.cdc_test replicated from primary within 90s", + "mysql replica: both cdc_test tables replicated from primary within 90s", { "primary": helper_mysql.MYSQL_HOST, "replica": helper_mysql.MYSQL_REPLICA_HOST, @@ -139,7 +171,7 @@ def main() -> int: if not replica_ready: # Proceed anyway — replication may catch up before Materialize tries to # validate the source, but log a warning so triage can correlate. 
- LOG.warning("replica table not yet visible; proceeding with source creation") + LOG.warning("replica tables not yet visible; proceeding with source creation") ensure_mysql_cdc_source() diff --git a/test/antithesis/workload/test/helper_mysql_source.py b/test/antithesis/workload/test/helper_mysql_source.py index 34323a846faed..b45af21e5a6e7 100644 --- a/test/antithesis/workload/test/helper_mysql_source.py +++ b/test/antithesis/workload/test/helper_mysql_source.py @@ -20,6 +20,14 @@ - CONNECTION antithesis_mysql_conn -> mysql-replica - SOURCE mysql_cdc_source (IN CLUSTER antithesis_cluster) - TABLE antithesis_cdc (REFERENCE antithesis.cdc_test) + - TABLE antithesis_cdc_myisam (REFERENCE antithesis.cdc_test_myisam) + +The MyISAM-backed reference exercises CDC for non-transactional DML: in +MySQL, MyISAM statements commit immediately (BEGIN/COMMIT is silently +ignored), so the binlog sees them as standalone events with their own +GTIDs rather than bundled inside a transaction. Materialize's source +code path doesn't distinguish engines, so this is a property check that +the engine-agnostic behavior actually holds. """ from __future__ import annotations @@ -38,11 +46,13 @@ MYSQL_DATABASE = "antithesis" MYSQL_TABLE = "cdc_test" +MYSQL_TABLE_MYISAM = "cdc_test_myisam" SECRET_NAME = "antithesis_mysql_password" CONNECTION_NAME = "antithesis_mysql_conn" SOURCE_NAME = "mysql_cdc_source" TABLE_NAME = "antithesis_cdc" +TABLE_NAME_MYISAM = "antithesis_cdc_myisam" def ensure_mysql_connection() -> None: @@ -60,30 +70,44 @@ def ensure_mysql_connection() -> None: ) -def ensure_mysql_cdc_table() -> None: - """Create the Materialize table from the MySQL CDC source (idempotent).""" +def _ensure_mysql_cdc_subtable(mz_table: str, upstream_table: str) -> None: + """Create one Materialize table that references `upstream_table` in the + MySQL CDC source (idempotent). Shared between the InnoDB and MyISAM + references; both come from the same source. 
+ """ try: execute_retry( - f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} " + f"CREATE TABLE IF NOT EXISTS {mz_table} " f"FROM SOURCE {SOURCE_NAME} " - f"(REFERENCE {MYSQL_DATABASE}.{MYSQL_TABLE})" + f"(REFERENCE {MYSQL_DATABASE}.{upstream_table})" ) except psycopg.errors.InternalError as exc: if "already exists" not in str(exc): raise - rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME,)) + rows = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (mz_table,)) if rows: - LOG.info("table %s landed concurrently; tolerating collision", TABLE_NAME) + LOG.info("table %s landed concurrently; tolerating collision", mz_table) return raise - LOG.info("mysql cdc table %s ready", TABLE_NAME) + LOG.info("mysql cdc table %s ready (upstream=%s)", mz_table, upstream_table) + + +def ensure_mysql_cdc_table() -> None: + """Create the InnoDB-backed Materialize table from the source.""" + _ensure_mysql_cdc_subtable(TABLE_NAME, MYSQL_TABLE) + + +def ensure_mysql_cdc_myisam_table() -> None: + """Create the MyISAM-backed Materialize table from the source.""" + _ensure_mysql_cdc_subtable(TABLE_NAME_MYISAM, MYSQL_TABLE_MYISAM) def ensure_mysql_cdc_source() -> None: """Create the full MySQL CDC pipeline in Materialize (idempotent). - Requires antithesis.cdc_test to already exist on the MySQL replica. - Call first_mysql_replica_setup.py before this in any standalone use. + Requires antithesis.cdc_test AND antithesis.cdc_test_myisam to already + exist on the MySQL replica. Call first_mysql_replica_setup.py before + this in any standalone use. 
""" ensure_mysql_connection() create_source_idempotent( @@ -94,3 +118,4 @@ def ensure_mysql_cdc_source() -> None: ) LOG.info("mysql cdc source %s ready", SOURCE_NAME) ensure_mysql_cdc_table() + ensure_mysql_cdc_myisam_table() diff --git a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py new file mode 100644 index 0000000000000..00542bba536bc --- /dev/null +++ b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Antithesis driver for property `mysql-myisam-cdc-no-data-loss`. + +Sibling driver `parallel_driver_mysql_cdc.py` exercises the same property +shape against the InnoDB-backed `antithesis.cdc_test`. This driver +exercises the non-transactional flavor against the MyISAM-backed +`antithesis.cdc_test_myisam`. The Materialize MySQL source code path +doesn't distinguish engines — the binlog/CDC contract is engine-agnostic +— so this is a check that the engine-agnostic behavior actually holds +under non-transactional upstream DML. + +What's different about MyISAM in the binlog: + * BEGIN/COMMIT around MyISAM statements is silently ignored — each + statement commits immediately. + * Every MyISAM statement gets its own GTID (one transaction per + statement, not per BEGIN/COMMIT block). + * No rollback semantics: a statement that fails partway through leaves + whatever rows it managed to insert committed and visible to the + binlog / replica / Materialize source. 
+ * No table-locking deadlock recovery: the storage engine takes + table-level locks, so concurrent writers serialize rather than + abort-and-retry. + +These differences shouldn't affect Materialize's view of the data: every +acknowledged INSERT must appear in the CDC source with the right value. +That's the property this driver asserts, with the same shape as +`parallel_driver_mysql_cdc.py`. + +Each invocation: + 1. Checks the MySQL CDC source and the MyISAM reference table exist. + 2. Picks a per-invocation `batch_id` prefix so concurrent drivers + (including the InnoDB sibling) don't collide. + 3. Inserts ROWS_PER_INVOCATION rows to the MyISAM table on the primary. + 4. Requests an Antithesis quiet period and polls the Materialize source + table until all expected rows appear (or the budget expires). + 5. Asserts correctness via `always(...)` on count and per-row values. +""" + +from __future__ import annotations + +import logging +import sys +import time + +import helper_mysql +import helper_random +from antithesis.assertions import always, sometimes +from helper_mysql_source import ( + MYSQL_DATABASE, + MYSQL_TABLE_MYISAM, + SOURCE_NAME, + TABLE_NAME_MYISAM, +) +from helper_pg import query_retry +from helper_quiet import request_quiet_period + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" +) +LOG = logging.getLogger("driver.mysql_myisam") + +ROWS_PER_INVOCATION = 20 +QUIET_PERIOD_S = 25 +CATCHUP_TIMEOUT_S = 90.0 +POLL_INTERVAL_S = 1.0 + + +def _source_ready() -> bool: + """Source + MyISAM reference table both exist in Materialize.""" + src = query_retry("SELECT 1 FROM mz_sources WHERE name = %s", (SOURCE_NAME,)) + tbl = query_retry("SELECT 1 FROM mz_tables WHERE name = %s", (TABLE_NAME_MYISAM,)) + return bool(src) and bool(tbl) + + +def _insert_rows(batch_id: str) -> dict[str, str]: + """Insert ROWS_PER_INVOCATION rows to the MyISAM table. 
+ + Each insert is its own implicit transaction (MyISAM ignores BEGIN/ + COMMIT). Returns {id → value} for every successfully inserted row. + Failures are logged and skipped: under fault injection the primary + may be unreachable mid-loop, and the property is "every + acknowledged INSERT shows up," not "every attempted INSERT shows up." + """ + expected: dict[str, str] = {} + for i in range(ROWS_PER_INVOCATION): + row_id = f"{batch_id}:{i}" + value = f"v{helper_random.random_int(0, 9999):04d}" + try: + helper_mysql.execute_primary( + f"INSERT INTO {MYSQL_DATABASE}.{MYSQL_TABLE_MYISAM} " + "(id, batch_id, value) VALUES (%s, %s, %s) " + "ON DUPLICATE KEY UPDATE value = VALUES(value), batch_id = VALUES(batch_id)", + (row_id, batch_id, value), + database=MYSQL_DATABASE, + ) + expected[row_id] = value + except Exception as exc: # noqa: BLE001 + LOG.info("MyISAM insert failed for row %s: %s; skipping", row_id, exc) + return expected + + +def _wait_for_catchup(batch_id: str, expected_count: int) -> bool: + """Poll Materialize until all expected rows for `batch_id` appear in + the MyISAM-referenced subsource. 
+ """ + deadline = time.monotonic() + CATCHUP_TIMEOUT_S + last_seen = -1 + while time.monotonic() < deadline: + try: + rows = query_retry( + f"SELECT COUNT(*)::bigint FROM {TABLE_NAME_MYISAM} WHERE batch_id = %s", + (batch_id,), + ) + count = int(rows[0][0]) if rows and rows[0][0] is not None else 0 + except Exception as exc: # noqa: BLE001 + LOG.info("catchup poll failed: %s; retrying", exc) + time.sleep(POLL_INTERVAL_S) + continue + + if count != last_seen: + LOG.info( + "mysql myisam catchup: batch=%s observed=%d target=%d", + batch_id, + count, + expected_count, + ) + last_seen = count + + if count >= expected_count: + return True + time.sleep(POLL_INTERVAL_S) + + LOG.warning( + "mysql myisam catchup timeout: batch=%s last_seen=%d target=%d", + batch_id, + last_seen, + expected_count, + ) + return False + + +def _check_rows(expected: dict[str, str]) -> None: + """Assert every expected row has the correct value in the Materialize + MyISAM-referenced subsource. Uses real_time_recency so the per-row + SELECT chosen-ts waits for the MySQL source's real-time upstream + frontier; the count-based catchup above can clear at a chosen-ts that + just barely satisfies the COUNT, leaving a per-row SELECT moments + later to race. + """ + for row_id, want in expected.items(): + rows = query_retry( + f"SELECT value FROM {TABLE_NAME_MYISAM} WHERE id = %s", + (row_id,), + real_time_recency=True, + ) + found = bool(rows) + observed = rows[0][0] if found else None + always( + found and observed == want, + "mysql myisam: CDC source row has correct value after catchup", + { + "source": TABLE_NAME_MYISAM, + "id": row_id, + "expected_value": want, + "observed_present": found, + "observed_value": observed, + }, + ) + + +def main() -> int: + if not _source_ready(): + # first_mysql_replica_setup must run before this driver. Outside + # Antithesis (e.g. snouty validate) the source / MyISAM table may + # not exist yet — exit cleanly rather than erroring so validate + # can still proceed. 
+ LOG.warning( + "mysql cdc source %s or MyISAM table %s not found; skipping " + "(first_mysql_replica_setup must run first)", + SOURCE_NAME, + TABLE_NAME_MYISAM, + ) + return 0 + + batch_id = f"myi-p{helper_random.random_u64():016x}" + LOG.info("driver starting; batch_id=%s", batch_id) + + expected = _insert_rows(batch_id) + if not expected: + LOG.info("no rows inserted successfully this invocation; exiting cleanly") + return 0 + + LOG.info("inserted %d rows; requesting quiet period", len(expected)) + request_quiet_period(QUIET_PERIOD_S) + + caught_up = _wait_for_catchup(batch_id, len(expected)) + + sometimes( + caught_up, + "mysql myisam: CDC source caught up to all primary inserts after quiet period", + { + "source": TABLE_NAME_MYISAM, + "batch_id": batch_id, + "rows_inserted": len(expected), + }, + ) + + if not caught_up: + LOG.info("catchup did not complete in budget; skipping per-row assertions") + return 0 + + _check_rows(expected) + + rows = query_retry( + f"SELECT COUNT(*)::bigint FROM {TABLE_NAME_MYISAM} WHERE batch_id = %s", + (batch_id,), + real_time_recency=True, + ) + count_in_mz = int(rows[0][0]) if rows and rows[0][0] is not None else 0 + always( + count_in_mz == len(expected), + "mysql myisam: CDC source row count matches inserted count after catchup", + { + "source": TABLE_NAME_MYISAM, + "batch_id": batch_id, + "expected_count": len(expected), + "observed_count": count_in_mz, + }, + ) + + LOG.info( + "driver done; asserted on %d MyISAM rows for batch_id=%s", + len(expected), + batch_id, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 8060eb2b264c89c7c2ce0739c9bbcd84999a77c8 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 12:53:19 -0400 Subject: [PATCH 53/65] test/antithesis: per-service container_name + hostname + explicit bridge network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes the Antithesis platform needs from the docker-compose YAML, applied 
together at export time so they stay in lockstep: 1. Set `container_name` and `hostname` on every service (matching the service key). Per Antithesis docker best practices, triage reports attribute log lines and assertions by hostname; without an explicit hostname the platform infers one (possibly the container id) that's harder to recognize. The workload container is the highest-value case but the rule is uniform. 2. Define a named bridge network (`antithesis-net`) at the top level and put every service on it. Relying on docker-compose's auto-generated `default` network was leaving DNS resolution up to whatever the surrounding Antithesis orchestration decided; an earlier run on this stack failed with kafka unable to resolve `zookeeper` (UnknownHostException) during setup. Antithesis support pointed at the network shape as the likely cause and suggested declaring it explicitly. We deliberately do not set `internal: true`, per Antithesis docker best practices — that would cut us off from the Antithesis-side instrumentation network. Both transforms live in export-compose.py so they apply uniformly to every present and future service. The export also sanity-checks that no service key contains an underscore (RFC-1123) and raises if one does; all current keys already use hyphens. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/antithesis/config/docker-compose.yaml | 56 ++++++++++++++++++- test/antithesis/export-compose.py | 65 +++++++++++++++++++++- 2 files changed, 117 insertions(+), 4 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 8b162e1224a78..a7033da124574 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -68,6 +68,10 @@ services: exec docker-entrypoint.sh "$$@"' - -- + container_name: postgres-metadata + hostname: postgres-metadata + networks: + - antithesis-net minio: entrypoint: - sh @@ -93,6 +97,10 @@ services: start_period: 30s platform: linux/amd64 image: minio/minio:latest + container_name: minio + hostname: minio + networks: + - antithesis-net zookeeper: image: confluentinc/cp-zookeeper:7.9.4 ports: @@ -109,6 +117,10 @@ services: interval: 1s start_period: 120s platform: linux/amd64 + container_name: zookeeper + hostname: zookeeper + networks: + - antithesis-net kafka: image: confluentinc/cp-kafka:7.9.4 ports: @@ -140,13 +152,16 @@ services: interval: 1s start_period: 120s platform: linux/amd64 + container_name: kafka + hostname: kafka + networks: + - antithesis-net schema-registry: image: confluentinc/cp-schema-registry:7.9.4 ports: - 8081 networks: - default: - aliases: [] + - antithesis-net environment: - SCHEMA_REGISTRY_KAFKASTORE_TIMEOUT_MS=10000 - SCHEMA_REGISTRY_KAFKASTORE_TOPIC_REPLICATION_FACTOR=1 @@ -171,6 +186,8 @@ services: interval: 1s start_period: 120s platform: linux/amd64 + container_name: schema-registry + hostname: schema-registry mysql: init: true ports: @@ -201,6 +218,10 @@ services: - mydata:/var/lib/mysql-files image: mysql:9.5.0 platform: linux/amd64 + container_name: mysql + hostname: mysql + networks: + - antithesis-net mysql-replica: init: true ports: @@ -235,6 +256,10 @@ services: - mydata:/var/lib/mysql-files image: mysql:9.5.0 platform: linux/amd64 + 
container_name: mysql-replica + hostname: mysql-replica + networks: + - antithesis-net clusterd1: entrypoint: - tini @@ -273,6 +298,10 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + container_name: clusterd1 + hostname: clusterd1 + networks: + - antithesis-net clusterd2: entrypoint: - tini @@ -311,6 +340,10 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + container_name: clusterd2 + hostname: clusterd2 + networks: + - antithesis-net clusterd-pool-0: entrypoint: - tini @@ -349,6 +382,10 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + container_name: clusterd-pool-0 + hostname: clusterd-pool-0 + networks: + - antithesis-net clusterd-pool-1: entrypoint: - tini @@ -387,6 +424,10 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + container_name: clusterd-pool-1 + hostname: clusterd-pool-1 + networks: + - antithesis-net materialized: hostname: materialized depends_on: @@ -575,6 +616,9 @@ services: stop_grace_period: 120s platform: linux/amd64 image: ${MATERIALIZED_IMAGE} + container_name: materialized + networks: + - antithesis-net workload: depends_on: materialized: @@ -608,7 +652,13 @@ services: - MYSQL_PASSWORD=p@ssw0rd platform: linux/amd64 image: ${ANTITHESIS_WORKLOAD_IMAGE} -networks: {} + container_name: workload + hostname: workload + networks: + - antithesis-net +networks: + antithesis-net: + driver: bridge volumes: mzdata: null pgdata: null diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index a204a76fdbf87..f7155ba31c51d 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -189,6 +189,66 @@ def strip_mzcompose_keys(svc: dict[str, Any]) -> None: svc.pop(key, None) +# Single user-defined bridge network every service joins. 
Defining the +# network explicitly (rather than relying on docker-compose's auto- +# generated `default`) gives us deterministic container-DNS regardless +# of how the Antithesis platform's surrounding orchestration parses the +# compose file. Antithesis support flagged the auto-network as a likely +# cause of a kafka -> zookeeper UnknownHostException during setup; the +# fix is to make the network explicit. +# +# Must NOT set `internal: true` per Antithesis docker best practices — +# that would cut us off from the Antithesis-side network used for +# instrumentation. Plain bridge is the recommended shape. +ANTITHESIS_NETWORK = "antithesis-net" + + +def assign_network(svc: dict[str, Any]) -> None: + """Place the service on the single named bridge network so docker-DNS + is deterministic. Overwrites any pre-existing `networks` entry — some + upstream Service classes set a vestigial `default: aliases: []` block + that we don't want carried through. + """ + svc["networks"] = [ANTITHESIS_NETWORK] + + +def declare_top_level_network(compose: dict[str, Any]) -> None: + """Declare the bridge network at the compose top level. Overwrites any + pre-existing top-level `networks:` entry (mzcompose currently emits + an empty dict). + """ + compose["networks"] = { + ANTITHESIS_NETWORK: {"driver": "bridge"}, + } + + +def set_explicit_names(name: str, svc: dict[str, Any]) -> None: + """Set `container_name` and `hostname` to the service key. + + Per Antithesis docker best practices (https://antithesis.com/docs/ + best_practices/docker_best_practices/), every service should declare + its container_name and hostname explicitly and use the same value + for both. Triage reports attribute log lines and assertions by + `hostname`; if it isn't set, Antithesis falls back to an inferred + value (possibly the container id) that's harder to recognize. 
+ + Set here at export time rather than per-service in mzcompose.py so + that local mzcompose runs aren't constrained to one global + container_name namespace. + + Asserts the service key is DNS-safe (no underscores, RFC-1123). + Docker Compose itself rejects underscored service keys, so this is + a sanity check, not a transform. + """ + if "_" in name: + raise ValueError( + f"service {name!r}: underscores in hostnames break DNS resolution " + f"under Antithesis (RFC-1123). Rename the service to use hyphens." + ) + svc["container_name"] = name + svc["hostname"] = name + + def register_referenced_named_volumes(compose: dict[str, Any]) -> None: """Declare any named volume referenced by a service that isn't already declared at the top level. Docker Compose rejects the file otherwise. @@ -223,7 +283,7 @@ def main() -> None: repo = Repository(Path("."), arch=Arch.X86_64, antithesis=True) c = Composition(repo, "antithesis", munge_services=False) - for svc in c.compose["services"].values(): + for name, svc in c.compose["services"].items(): svc["platform"] = "linux/amd64" if "mzbuild" in svc: resolve_mzbuild(svc) @@ -231,7 +291,10 @@ def main() -> None: strip_host_bindmounts(svc) strip_incompatible_env(svc) strip_mzcompose_keys(svc) + set_explicit_names(name, svc) + assign_network(svc) + declare_top_level_network(c.compose) register_referenced_named_volumes(c.compose) sys.stdout.write(HEADER) From 7d5aa56b5918ef9a1fda9ccfce1f50acb131ec4b Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 16:40:03 -0400 Subject: [PATCH 54/65] test/antithesis: gate service_started depends_on on healthcheck when available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Data-driven export-compose transform: for every depends_on entry that uses `condition: service_started` against a dependency that declares a `healthcheck`, upgrade the condition to `service_healthy`. 
Dependencies without a healthcheck (currently only clusterd) are left as `service_started` since there's nothing to wait on. Under the Antithesis platform, `service_started` proved unreliable as a readiness gate during initial container startup. Docker fires it as soon as the dependency's container process starts, before the dependency's DNS entry is reliably resolvable. The previous run on the fault-isolated topology saw kafka hit `java.net.UnknownHostException: zookeeper: Name or service not known` 148+ times in a row before its retry loop landed on a successful lookup, with the same cascade downstream (schema-registry -> kafka). Both containers exited with code 1 from those retries, tripping the "No unexpected container exits" property. Upgraded edges: kafka -> zookeeper (zookeeper:2181 nc healthcheck); schema-registry -> kafka (kafka:9092 nc healthcheck); materialized -> minio (minio /minio/health/live curl); workload -> schema-registry (schema-registry curl healthcheck). Left alone: workload -> clusterd{1,2} (no clusterd healthcheck). Gating on the healthcheck (which probes the actual listen port) eliminates the DNS-race shape because docker won't fire `service_healthy` until the dependency is answering on its port — and DNS is reliably resolvable by then. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/antithesis/config/docker-compose.yaml | 8 ++--- test/antithesis/export-compose.py | 37 ++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index a7033da124574..8201cb9a11e7f 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -141,7 +141,7 @@ services: - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 depends_on: zookeeper: - condition: service_started + condition: service_healthy healthcheck: test: - CMD @@ -175,7 +175,7 @@ services: && exec /etc/confluent/docker/launch depends_on: kafka: - condition: service_started + condition: service_healthy healthcheck: test: - CMD @@ -432,7 +432,7 @@ services: hostname: materialized depends_on: minio: - condition: service_started + condition: service_healthy postgres-metadata: condition: service_healthy command: @@ -630,7 +630,7 @@ services: kafka: condition: service_healthy schema-registry: - condition: service_started + condition: service_healthy mysql: condition: service_healthy mysql-replica: diff --git a/test/antithesis/export-compose.py b/test/antithesis/export-compose.py index f7155ba31c51d..b1921f10474be 100644 --- a/test/antithesis/export-compose.py +++ b/test/antithesis/export-compose.py @@ -249,6 +249,42 @@ def set_explicit_names(name: str, svc: dict[str, Any]) -> None: svc["hostname"] = name +def upgrade_started_to_healthy(compose: dict[str, Any]) -> None: + """For every `depends_on` entry that uses `condition: service_started` + against a dependency that declares a `healthcheck`, upgrade the + condition to `service_healthy`. + + Under the Antithesis platform, `service_started` proved unreliable as + a readiness gate during initial container startup: docker fires it as + soon as the dependency's container *process* starts, before the + dependency's DNS entry is reliably resolvable. 
The first run on the + fault-isolated topology saw kafka hit `UnknownHostException: zookeeper` + 148+ times in a row before its retry loop landed on a successful + lookup, with the same cascade downstream (schema-registry ↔ kafka). + Gating on the healthcheck (which probes the actual listen port) + eliminates that race. + + Dependencies without a healthcheck (e.g. clusterd, which has no + readiness signal we currently expose) are left as `service_started` + — there's nothing to wait on. + """ + services = compose.get("services", {}) + has_healthcheck = { + name for name, svc in services.items() if "healthcheck" in svc + } + for svc in services.values(): + deps = svc.get("depends_on") + if not isinstance(deps, dict): + continue + for dep_name, dep_spec in deps.items(): + if ( + isinstance(dep_spec, dict) + and dep_spec.get("condition") == "service_started" + and dep_name in has_healthcheck + ): + dep_spec["condition"] = "service_healthy" + + def register_referenced_named_volumes(compose: dict[str, Any]) -> None: """Declare any named volume referenced by a service that isn't already declared at the top level. Docker Compose rejects the file otherwise. @@ -295,6 +331,7 @@ def main() -> None: assign_network(svc) declare_top_level_network(c.compose) + upgrade_started_to_healthy(c.compose) register_referenced_named_volumes(c.compose) sys.stdout.write(HEADER) From 54fdf00287af4b099ed6184863cc9e3b4e511e05 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 16:48:00 -0400 Subject: [PATCH 55/65] test/antithesis: route every workload draw through Antithesis SDK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Antithesis feedback noted that parallel_driver_parallel_workload pulls one u64 from the SDK, seeds a stdlib `random.Random`, and then makes every downstream decision deterministically off that seed — locking the fuzzer out of all branches in the framework's action/expression subtree. 
Add `AntithesisRandom`, a `random.Random` subclass that overrides `getrandbits()` and `random()` to draw from the Antithesis SDK on every call. Plug it into `parallel_driver_parallel_workload` so action selection, DDL choices, expression shape, sample sizes, and every other in-framework `self.rng.*` call route through the SDK per draw. Each worker thread gets its own instance. Also add `random_float(low, high)` in helper_random — needed by the follow-up commit that swarms `TOMBSTONE_PROB`/`DROP_PROBABILITY` across invocations. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../antithesis/workload/test/helper_random.py | 95 +++++++++++++++++-- .../test/parallel_driver_parallel_workload.py | 25 +++-- 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/test/antithesis/workload/test/helper_random.py b/test/antithesis/workload/test/helper_random.py index cb749227d6f17..4900778f8b6ab 100644 --- a/test/antithesis/workload/test/helper_random.py +++ b/test/antithesis/workload/test/helper_random.py @@ -7,11 +7,26 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. -"""Deterministic randomness for Antithesis drivers. - -All driver randomness must go through the Antithesis SDK so timelines replay -deterministically. Outside Antithesis we fall back to the stdlib `random` with a -fixed-but-arbitrary seed per process so local runs are not flaky. +"""Antithesis randomness primitives for drivers. + +Two layers: + + * Free functions (`random_u64`, `random_int`, `random_bool`, `random_choice`, + `random_float`) for direct use in driver code. Each call draws fresh + entropy from the Antithesis SDK so different timelines see different + values at the same call site — that's how the fuzzer drives coverage. + + * `AntithesisRandom`, a `random.Random` subclass that routes every + `getrandbits()` and `random()` call through the SDK. 
Use it when + handing an rng to code that expects a `random.Random` (notably + `materialize.parallel_workload`'s `Worker`/`Action`). Seeding a stdlib + `random.Random` from a single SDK draw and then making every + subsequent decision deterministic locks the fuzzer out of every + branch in that subtree; this class avoids that. + +Outside Antithesis (e.g. snouty local validate) the SDK is unavailable; +the helpers and the subclass fall back to a stdlib `Random` seeded from +`os.urandom` so local runs are non-deterministic but functional. """ from __future__ import annotations @@ -19,7 +34,7 @@ import os import random as _stdlib_random from collections.abc import Sequence -from typing import TypeVar +from typing import Any, TypeVar try: from antithesis import random as _ar @@ -30,8 +45,8 @@ T = TypeVar("T") -# A stable per-process seed so local snouty validate runs are deterministic -# within one process but pick a different sequence per process invocation. +# Fallback rng for non-Antithesis runs. Seeded once at import time from +# the OS entropy pool so each process picks a different sequence. _FALLBACK = _stdlib_random.Random(int.from_bytes(os.urandom(8), "little")) @@ -60,5 +75,67 @@ def random_int(low: int, high: int) -> int: def random_bool(true_prob: float) -> bool: if not 0.0 <= true_prob <= 1.0: raise ValueError("true_prob out of range") - # Use 16 bits of entropy to avoid floating-point quirks under replay. + # 16 bits of entropy avoids floating-point quirks under replay. return (random_u64() & 0xFFFF) < int(true_prob * 0x10000) + + +def random_float(low: float, high: float) -> float: + """Uniform draw from [low, high). Useful for swarm parameters where + each driver invocation should pick its own probability/weight value + so different timelines explore different workload mixes.""" + if low > high: + raise ValueError("low > high") + # 53 bits is the precision of a Python float's mantissa; matches what + # stdlib `random.random()` returns. 
+ unit = random_u64() >> 11 + fraction = unit / (1 << 53) + return low + fraction * (high - low) + + +class AntithesisRandom(_stdlib_random.Random): + """A `random.Random` whose every draw comes from the Antithesis SDK. + + The CPython `random.Random` API routes `choice`, `randint`, + `randrange`, `sample`, `shuffle`, etc. through `getrandbits()`, and + `random()` is its only floating-point primitive. Overriding both + here means anything handed an `AntithesisRandom` exercises Antithesis + entropy at every decision point, not just once per seed. + + Outside Antithesis we delegate to the module-level `_FALLBACK` so + local runs still produce values; instances share that fallback + rather than each carrying their own state. + + `seed()` is intentionally a no-op: a Mersenne-Twister-style seed + isn't meaningful when entropy is supplied per-draw. `getstate` / + `setstate` raise because the SDK's internal state isn't observable + or restorable. + """ + + def random(self) -> float: + # Match stdlib `Random.random()` width: top 53 bits of a u64. + return (random_u64() >> 11) / (1 << 53) + + def getrandbits(self, k: int) -> int: + if k <= 0: + raise ValueError("number of bits must be greater than zero") + # Pull 64-bit chunks until we have at least k bits, then shift the + # surplus off the bottom so the result is in [0, 2**k). + nchunks = (k + 63) // 64 + bits = 0 + for _ in range(nchunks): + bits = (bits << 64) | random_u64() + return bits >> (nchunks * 64 - k) + + def seed(self, *args: Any, **kwargs: Any) -> None: + # Entropy comes from the SDK per call; nothing to seed. 
+ return None + + def getstate(self) -> Any: + raise NotImplementedError( + "AntithesisRandom has no snapshottable state; the SDK owns it" + ) + + def setstate(self, state: Any) -> None: + raise NotImplementedError( + "AntithesisRandom has no restorable state; the SDK owns it" + ) diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 4f5302c714544..945fd0805e515 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -40,7 +40,6 @@ import logging import os -import random import sys import threading import time @@ -426,18 +425,27 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None: def _spawn_workers( - rng: random.Random, + rng: helper_random.AntithesisRandom, database: Database, end_time: float, num_threads: int, ) -> tuple[list[Worker], list[threading.Thread]]: """Build the same thread pool `parallel_workload.run()` does for - `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper.""" + `Complexity.DDL`, minus the per-scenario kill/cancel/backup helper. + + Each worker gets its own `AntithesisRandom` instance so the framework's + per-Action `self.rng.choice/randint/random/sample` calls route through + Antithesis on every draw. The framework expects a `random.Random`; + `AntithesisRandom` is a subclass that overrides the entropy primitives + to read from the SDK, so action selection, expression shape, DDL + choices, and every other decision are driven by the fuzzer instead of + being locked in after one seed. 
+ """ weights = [60, 30, 30, 30, 100] workers: list[Worker] = [] threads: list[threading.Thread] = [] for i in range(num_threads): - worker_rng = random.Random(rng.randrange(1_000_000)) + worker_rng = helper_random.AntithesisRandom() action_list = worker_rng.choices( [ read_action_list, @@ -475,7 +483,12 @@ def _spawn_workers( def main() -> int: seed = str(helper_random.random_u64()) - rng = random.Random(seed) + # AntithesisRandom routes every getrandbits/random call through the + # Antithesis SDK, so every decision the parallel_workload framework + # makes downstream of this rng draws fresh entropy on each call. A + # stdlib `random.Random(seed)` would lock the timeline in after one + # draw and the fuzzer couldn't drive differing branches. + rng = helper_random.AntithesisRandom() LOG.info( "parallel-workload starting: seed=%s threads=%d runtime=%ss", @@ -513,7 +526,7 @@ def main() -> int: def _run_invocation( seed: str, - rng: random.Random, + rng: helper_random.AntithesisRandom, cluster_name: str, ) -> int: """The bulk of `main()` once a pool slot has been claimed. Split out From 63e10740ad74c956b5711adf614ad9ab65ceac42 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 16:52:24 -0400 Subject: [PATCH 56/65] test/antithesis: swarm tombstone / drop probabilities per invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Antithesis feedback called out hardcoded probability constants (TOMBSTONE_PROB, DROP_PROBABILITY) as missed swarm-testing opportunities — every timeline ran the exact same workload mix instead of letting the fuzzer drive the parameter. 
Replace the three hardcoded constants with per-invocation draws from helper_random.random_float() over sensible ranges: parallel_driver_upsert_latest_value: TOMBSTONE_PROB 0.15 -> random_float(0.05, 0.50) singleton_driver_upsert_state_rehydration: TOMBSTONE_PROB 0.20 -> random_float(0.05, 0.50) (fixed per run so cross-cycle stability of `expected` still tests rehydration) singleton_driver_catalog_recovery_consistency: DROP_PROBABILITY 0.20 -> random_float(0.10, 0.50) The draw happens once at the top of main() and is logged for triage. Each timeline ends up with a different mix; the fuzzer is free to push toward whichever extreme reveals a bug. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../parallel_driver_upsert_latest_value.py | 20 +++++++++++-- ...ton_driver_catalog_recovery_consistency.py | 20 ++++++++++--- ...ngleton_driver_upsert_state_rehydration.py | 28 +++++++++++++++---- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index fcfabea77620d..68734c7f03c82 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -58,7 +58,14 @@ PRODUCES_PER_INVOCATION = 40 DISTINCT_KEYS = 8 # small key space so we re-write the same key often DISTINCT_VALUES = 16 -TOMBSTONE_PROB = 0.15 + +# Tombstone probability is drawn per-invocation in main() from a wide +# range. Different timelines see different mixes — heavy-tombstone runs +# stress upsert removal, mostly-live runs stress value-overwrite — and +# the fuzzer drives which one each timeline gets. A fixed constant would +# make every invocation identical in this respect and waste fuzzer +# budget on the same workload shape. 
+TOMBSTONE_PROB_RANGE = (0.05, 0.50) QUIET_PERIOD_S = 20 CATCHUP_TIMEOUT_S = 60.0 @@ -113,7 +120,14 @@ def main() -> int: # Per-invocation prefix isolates this driver's keys from other concurrent # drivers and from previous invocations of this same driver. prefix = f"p{helper_random.random_u64():016x}" - LOG.info("driver starting; prefix=%s", prefix) + + # Swarm: pick this invocation's tombstone fraction from the configured + # range. The fuzzer sees this as one of the first decisions in the + # timeline and can drive it toward whichever extreme reveals a bug. + tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE) + LOG.info( + "driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob + ) producer, tracker = make_producer(client_id=f"antithesis-{prefix}") @@ -131,7 +145,7 @@ def main() -> int: keys = [f"{prefix}-k{i}" for i in range(DISTINCT_KEYS)] for _ in range(PRODUCES_PER_INVOCATION): key = helper_random.random_choice(keys) - if helper_random.random_bool(TOMBSTONE_PROB): + if helper_random.random_bool(tombstone_prob): if expected.get(key) is not None: tombstoned_after_value += 1 _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, None) diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py index 53e791185b4ab..5612a19c30ea8 100755 --- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py +++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py @@ -73,9 +73,13 @@ # mid-DDL still resolves before the next cycle. CYCLE_COUNT high enough to # give Antithesis multiple windows to land a restart between cycles. CYCLE_COUNT = 10 -DROP_PROBABILITY = 0.20 INTER_CYCLE_SLEEP_S = 2.0 +# Drop fraction is swarmed per-invocation in main(). 
Wide range so different +# timelines exercise create-heavy (catalog grows) and drop-heavy (churn- +# through-recovery) modes without rebuilding the driver. +DROP_PROBABILITY_RANGE = (0.10, 0.50) + PROBE_CONNECT_TIMEOUT_S = 2.0 @@ -134,6 +138,7 @@ def _run_cycle( name_prefix: str, cycle_idx: int, next_id: int, + drop_probability: float, ) -> tuple[bool, int]: """One create-or-drop + verify cycle. @@ -150,7 +155,7 @@ def _run_cycle( missing from the post-recovery catalog. """ new_id = next_id - if expected and helper_random.random_bool(DROP_PROBABILITY): + if expected and helper_random.random_bool(drop_probability): # Drop a random existing table. Choosing from `expected` keeps the # drop deterministic w.r.t. the local model. table = sorted(expected)[helper_random.random_int(0, len(expected) - 1)] @@ -210,7 +215,12 @@ def main() -> int: # Per-timeline namespace so concurrent timelines and any future # parallel_driver_ instances do not collide on table names. name_prefix = f"catrec_{helper_random.random_u64():016x}" - LOG.info("catalog recovery driver starting; name_prefix=%s", name_prefix) + drop_probability = helper_random.random_float(*DROP_PROBABILITY_RANGE) + LOG.info( + "catalog recovery driver starting; name_prefix=%s drop_probability=%.3f", + name_prefix, + drop_probability, + ) expected: set[str] = set() next_id = 0 @@ -218,7 +228,9 @@ def main() -> int: saw_coord_unavailable = False for cycle_idx in range(CYCLE_COUNT): - ran, next_id = _run_cycle(expected, name_prefix, cycle_idx, next_id) + ran, next_id = _run_cycle( + expected, name_prefix, cycle_idx, next_id, drop_probability + ) if ran: cycles_ran += 1 if _saw_coord_unavailable(): diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index 26342d0ed43e8..3c9876ba79988 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ 
b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -81,7 +81,14 @@ PRODUCES_PER_CYCLE = 30 DISTINCT_KEYS = 6 DISTINCT_VALUES = 12 -TOMBSTONE_PROB = 0.20 + +# Tombstone fraction is swarmed once per driver invocation (see main()) so +# different timelines exercise different live/dead mixes — heavy-tombstone +# runs stress the upsert-state-remove rehydration path, mostly-live runs +# stress value-overwrite rehydration. The choice is fixed for the whole +# driver lifetime so cross-cycle stability of `expected` still tests +# rehydration, not just per-cycle convergence. +TOMBSTONE_PROB_RANGE = (0.05, 0.50) QUIET_PERIOD_S = 25 CATCHUP_TIMEOUT_S = 120.0 @@ -113,7 +120,11 @@ def _select_value_for_key(key: str) -> tuple[bool, str | None]: def _run_cycle( - producer, tracker, expected: dict[str, str | None], cycle_idx: int + producer, + tracker, + expected: dict[str, str | None], + cycle_idx: int, + tombstone_prob: float, ) -> bool: """Produce one batch, settle, and assert state for every tracked key. @@ -122,7 +133,7 @@ def _run_cycle( keys = [f"reh-k{i}" for i in range(DISTINCT_KEYS)] for _ in range(PRODUCES_PER_CYCLE): key = helper_random.random_choice(keys) - if helper_random.random_bool(TOMBSTONE_PROB): + if helper_random.random_bool(tombstone_prob): producer.produce( topic=TOPIC_UPSERT_TEXT, key=key.encode("utf-8"), @@ -201,7 +212,14 @@ def _run_cycle( def main() -> int: ensure_upsert_text_source() - LOG.info("rehydration driver starting; %d cycles planned", CYCLE_COUNT) + # Swarm once per invocation, fixed for the run so cross-cycle stability + # of `expected` keeps testing rehydration rather than per-cycle drift. 
+ tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE) + LOG.info( + "rehydration driver starting; %d cycles planned tombstone_prob=%.3f", + CYCLE_COUNT, + tombstone_prob, + ) producer, tracker = make_producer(client_id="antithesis-rehydration") expected: dict[str, str | None] = {} @@ -209,7 +227,7 @@ def main() -> int: cycles_run = 0 for cycle_idx in range(CYCLE_COUNT): - if _run_cycle(producer, tracker, expected, cycle_idx): + if _run_cycle(producer, tracker, expected, cycle_idx, tombstone_prob): cycles_run += 1 time.sleep(INTER_CYCLE_SLEEP_S) From 6479ea8f391f05a584dcdae30b3dd0cc4e357b9a Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 16:54:44 -0400 Subject: [PATCH 57/65] test/antithesis: move quiet/active windows to a global fault-orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Antithesis feedback: with every parallel driver requesting its own `ANTITHESIS_STOP_FAULTS` window, the union of overlapping per-driver quiet periods leaves the SUT mostly un-faulted. Faults should arrive on a single coordinated cadence driven from a dedicated container, and workloads should stay robust to whatever quiet/faulting transitions the orchestrator picks — the catchup-then-assert pattern already there fits that model. Topology change: add a `fault-orchestrator` service backed by `bash:5` running `test/antithesis/fault-orchestrator/pause_faults.sh`, adapted from the Antithesis hands-on tutorial. It alternates faults-OFF/faults-ON windows at randomised intervals (START_DELAY=30, MIN_ON/MAX_ON/MIN_OFF/MAX_OFF=20-40), centralising the cadence. Outside Antithesis (`ANTITHESIS_STOP_FAULTS` unset) the script no-ops so snouty local validate still works. 
The script is loaded via `Path(...).read_text()` inside `FaultOrchestrator(Service)`; every `$` is doubled to `$$` before embedding into the compose YAML so docker-compose's parse-time variable interpolation doesn't eat shell references like `${RANDOM}` or `${ANTITHESIS_STOP_FAULTS}`. The on-disk .sh file stays plain bash so shellcheck and direct execution still work. Driver-side: delete `helper_quiet.py` and every `request_quiet_period` call site (9 drivers). Each driver's `wait_for_catchup` timeout (or the equivalent FINAL_READ_TIMEOUT_S in the strict-serializable driver) is bumped to span at least one MAX_OFF window plus catchup overhead — concretely 90s for the short Kafka/MV/upsert drivers, 120s for MySQL CDC paths, and 180s for the singleton rehydration driver which must survive a clusterd kill landing inside a quiet window. Liveness `sometimes(...)` anchor messages were renamed "after quiet period" → "within catchup budget" to match the new semantics; scratchbook docs that quoted the exact strings are updated to match. Regenerate test/antithesis/config/docker-compose.yaml via export-compose.py. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/antithesis/config/docker-compose.yaml | 64 ++++++++++++++++ .../fault-orchestrator/pause_faults.sh | 76 +++++++++++++++++++ test/antithesis/mzcompose.py | 75 ++++++++++++++++++ .../properties/kafka-source-no-data-loss.md | 2 +- .../mysql-myisam-cdc-no-data-loss.md | 2 +- .../properties/mysql-source-no-data-loss.md | 2 +- .../upsert-key-reflects-latest-value.md | 4 +- .../scratchbook/property-catalog.md | 4 +- test/antithesis/workload/test/helper_quiet.py | 38 ---------- .../parallel_driver_kafka_none_envelope.py | 18 +++-- ...rallel_driver_mv_reflects_table_updates.py | 20 ++--- .../test/parallel_driver_mysql_cdc.py | 18 ++--- .../test/parallel_driver_mysql_myisam.py | 17 ++--- .../test/parallel_driver_parallel_workload.py | 20 +++-- ...rallel_driver_strict_serializable_reads.py | 18 +++-- ...llel_driver_upsert_ancient_key_writable.py | 13 ++-- .../parallel_driver_upsert_latest_value.py | 32 ++++---- ...ton_driver_catalog_recovery_consistency.py | 3 +- ...ngleton_driver_upsert_state_rehydration.py | 39 +++++----- 19 files changed, 332 insertions(+), 133 deletions(-) create mode 100755 test/antithesis/fault-orchestrator/pause_faults.sh delete mode 100644 test/antithesis/workload/test/helper_quiet.py diff --git a/test/antithesis/config/docker-compose.yaml b/test/antithesis/config/docker-compose.yaml index 8201cb9a11e7f..ecd6eea161e07 100644 --- a/test/antithesis/config/docker-compose.yaml +++ b/test/antithesis/config/docker-compose.yaml @@ -619,6 +619,70 @@ services: container_name: materialized networks: - antithesis-net + fault-orchestrator: + image: bash:5 + entrypoint: + - bash + - -c + command: + - "#!/usr/bin/env bash\n\n# Copyright Materialize, Inc. and contributors.
All\ + \ rights reserved.\n#\n# Use of this software is governed by the Business Source\ + \ License\n# included in the LICENSE file at the root of this repository.\n\ + #\n# As of the Change Date specified in that file, in accordance with\n# the\ + \ Business Source License, use of this software will be governed\n# by the Apache\ + \ License, Version 2.0.\n\n# Drive Antithesis fault windows globally.\n#\n#\ + \ Antithesis injects faults into the system continuously by default.\n# Calling\ + \ `ANTITHESIS_STOP_FAULTS ` requests a quiet window \u2014\n# Antithesis\ + \ pauses fault injection for that many seconds. The Antithesis\n# engagement\ + \ team's recommendation: drive these quiet windows from a\n# single dedicated\ + \ container, not per-driver, otherwise overlapping\n# per-driver requests keep\ + \ the system in a quiet state most of the time\n# and we never actually fault.\n\ + #\n# This script alternates faults-OFF (quiet) and faults-ON (active)\n# windows\ + \ at randomized intervals so each timeline sees a different\n# cadence. Adapted\ + \ from the Antithesis hands-on tutorial:\n# https://github.com/antithesishq/hands-on-tutorial-1/blob/main/python/antithesis/pause_faults.sh\n\ + #\n# Outside Antithesis (snouty local validate) `ANTITHESIS_STOP_FAULTS` is\n\ + # unset; the script exits immediately so the rest of the compose works.\n\n\ + set -euo pipefail\n\nif [[ -z \"$${ANTITHESIS_STOP_FAULTS:-}\" ]]; then\n \ + \ echo \"ANTITHESIS_STOP_FAULTS not set; fault-orchestrator exiting (no-op)\"\ + \n exit 0\nfi\n\n# Tunable via the service `environment:` block.
Defaults\ + \ sized so that:\n# * MAX_ON is comfortably shorter than any driver's CATCHUP_TIMEOUT_S\n\ + # (smallest is 90s in parallel_driver_upsert_latest_value) \u2014 a\n# \ + \ driver's catchup window can always span at least one full quiet\n# \ + \ period.\n# * MIN_OFF is long enough for materialized to commit a few timestamps\n\ + # and for sources to advance offset_committed past the most recent\n# \ + \ batch of produced offsets.\n# * START_DELAY gives setup-complete + bootstrap\ + \ a window of un-faulted\n# time before the alternation begins.\nSTART_DELAY=\"\ + $${START_DELAY:-30}\"\nMIN_ON=\"$${MIN_ON:-20}\"\nMAX_ON=\"$${MAX_ON:-40}\"\n\ + MIN_OFF=\"$${MIN_OFF:-20}\"\nMAX_OFF=\"$${MAX_OFF:-40}\"\n\necho \"fault-orchestrator:\ + \ ON $${MIN_ON}-$${MAX_ON}s / OFF $${MIN_OFF}-$${MAX_OFF}s, initial pause $${START_DELAY}s\"\ + \n\n# Initial quiet window so the rest of the stack reaches steady state\n#\ + \ before Antithesis starts faulting. Antithesis may or may not honour\n# this\ + \ depending on when fault injection begins relative to setup-\n# complete; either\ + \ way the local sleep gives drivers a clean start.\n\"$${ANTITHESIS_STOP_FAULTS}\"\ + \ \"$${START_DELAY}\"\nsleep \"$${START_DELAY}\"\n\nwhile true; do\n # Re-seed\ + \ $$RANDOM from /dev/urandom so successive iterations don't\n # repeat the\ + \ same on/off period (the shell's RANDOM is a 16-bit LCG;\n # without reseeding\ + \ it can produce predictable sequences).\n RANDOM=$$(od -An -N2 -tu2 /dev/urandom\ + \ | tr -d ' ')\n ON_PERIOD=$$((MIN_ON + (RANDOM % (MAX_ON - MIN_ON + 1))))\n\ + \ OFF_PERIOD=$$((MIN_OFF + (RANDOM % (MAX_OFF - MIN_OFF + 1))))\n\n echo\ + \ \"fault-orchestrator: faults OFF for $${OFF_PERIOD}s\"\n \"$${ANTITHESIS_STOP_FAULTS}\"\ + \ \"$${OFF_PERIOD}\"\n sleep \"$${OFF_PERIOD}\"\n\n echo \"fault-orchestrator:\ + \ faults ON for $${ON_PERIOD}s\"\n sleep \"$${ON_PERIOD}\"\ndone\n" + environment: + - START_DELAY=30 + - MIN_ON=20 + - MAX_ON=40 + - MIN_OFF=20 + - MAX_OFF=40 +
depends_on: + materialized: + condition: service_healthy + restart: 'no' + platform: linux/amd64 + container_name: fault-orchestrator + hostname: fault-orchestrator + networks: + - antithesis-net workload: depends_on: materialized: diff --git a/test/antithesis/fault-orchestrator/pause_faults.sh b/test/antithesis/fault-orchestrator/pause_faults.sh new file mode 100755 index 0000000000000..00cb4e910bc47 --- /dev/null +++ b/test/antithesis/fault-orchestrator/pause_faults.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# Drive Antithesis fault windows globally. +# +# Antithesis injects faults into the system continuously by default. +# Calling `ANTITHESIS_STOP_FAULTS ` requests a quiet window — +# Antithesis pauses fault injection for that many seconds. The Antithesis +# engagement team's recommendation: drive these quiet windows from a +# single dedicated container, not per-driver, otherwise overlapping +# per-driver requests keep the system in a quiet state most of the time +# and we never actually fault. +# +# This script alternates faults-OFF (quiet) and faults-ON (active) +# windows at randomized intervals so each timeline sees a different +# cadence. Adapted from the Antithesis hands-on tutorial: +# https://github.com/antithesishq/hands-on-tutorial-1/blob/main/python/antithesis/pause_faults.sh +# +# Outside Antithesis (snouty local validate) `ANTITHESIS_STOP_FAULTS` is +# unset; the script exits immediately so the rest of the compose works.
+ +set -euo pipefail + +if [[ -z "${ANTITHESIS_STOP_FAULTS:-}" ]]; then + echo "ANTITHESIS_STOP_FAULTS not set; fault-orchestrator exiting (no-op)" + exit 0 +fi + +# Tunable via the service `environment:` block. Defaults sized so that: +# * MAX_ON is comfortably shorter than any driver's CATCHUP_TIMEOUT_S +# (smallest is 90s in parallel_driver_upsert_latest_value) — a +# driver's catchup window can always span at least one full quiet +# period. +# * MIN_OFF is long enough for materialized to commit a few timestamps +# and for sources to advance offset_committed past the most recent +# batch of produced offsets. +# * START_DELAY gives setup-complete + bootstrap a window of un-faulted +# time before the alternation begins. +START_DELAY="${START_DELAY:-30}" +MIN_ON="${MIN_ON:-20}" +MAX_ON="${MAX_ON:-40}" +MIN_OFF="${MIN_OFF:-20}" +MAX_OFF="${MAX_OFF:-40}" + +echo "fault-orchestrator: ON ${MIN_ON}-${MAX_ON}s / OFF ${MIN_OFF}-${MAX_OFF}s, initial pause ${START_DELAY}s" + +# Initial quiet window so the rest of the stack reaches steady state +# before Antithesis starts faulting. Antithesis may or may not honour +# this depending on when fault injection begins relative to setup- +# complete; either way the local sleep gives drivers a clean start. +"${ANTITHESIS_STOP_FAULTS}" "${START_DELAY}" +sleep "${START_DELAY}" + +while true; do + # Re-seed $RANDOM from /dev/urandom so successive iterations don't + # repeat the same on/off period (the shell's RANDOM is a 16-bit LCG; + # without reseeding it can produce predictable sequences).
+ RANDOM=$(od -An -N2 -tu2 /dev/urandom | tr -d ' ') + ON_PERIOD=$((MIN_ON + (RANDOM % (MAX_ON - MIN_ON + 1)))) + OFF_PERIOD=$((MIN_OFF + (RANDOM % (MAX_OFF - MIN_OFF + 1)))) + + echo "fault-orchestrator: faults OFF for ${OFF_PERIOD}s" + "${ANTITHESIS_STOP_FAULTS}" "${OFF_PERIOD}" + sleep "${OFF_PERIOD}" + + echo "fault-orchestrator: faults ON for ${ON_PERIOD}s" + sleep "${ON_PERIOD}" +done diff --git a/test/antithesis/mzcompose.py b/test/antithesis/mzcompose.py index bfbc1abd6d4ca..0f333afeee8e4 100644 --- a/test/antithesis/mzcompose.py +++ b/test/antithesis/mzcompose.py @@ -32,6 +32,13 @@ from the harness; defaults to 8). - materialized : the SUT (environmentd; clusterd is external) - workload : Python test driver wired to the Antithesis SDK + - fault-orchestrator : single bash container alternating quiet and + faulting windows globally via + `ANTITHESIS_STOP_FAULTS`. Centralising the + cadence avoids the failure mode where every + driver requests its own quiet window and the + union of overlapping requests keeps the system + in a quiet state most of the time. Usage: bin/mzcompose --find antithesis run default # bring up the cluster @@ -39,6 +46,7 @@ """ import os +from pathlib import Path from materialize.mzcompose.composition import Composition from materialize.mzcompose.service import Service, ServiceConfig @@ -75,6 +83,71 @@ CLUSTERD_WORKERS = 16 +class FaultOrchestrator(Service): + """Single bash container that drives Antithesis fault windows globally. + + Invokes `${ANTITHESIS_STOP_FAULTS} ` to open quiet windows, + then sleeps through faults-ON windows, on a randomised cadence + (MIN_ON..MAX_ON / MIN_OFF..MAX_OFF). The script is bundled in + `test/antithesis/fault-orchestrator/pause_faults.sh` and inlined into + the compose `command:` here so we don't need a new mzbuild image + just to ship 30 lines of bash.
+ + The Antithesis engagement team flagged per-driver quiet-period + requests as an anti-pattern: with many concurrent drivers each + asking for a quiet window, the union of overlapping windows leaves + the SUT mostly un-faulted. Centralising the cadence here means + faults arrive in one coordinated rhythm; drivers stay robust to + quiet/faulting transitions by relying on `wait_for_catchup` with + generous timeouts. + + Outside Antithesis `ANTITHESIS_STOP_FAULTS` is unset and the script + exits immediately, so this service is a no-op for local validate. + """ + + def __init__(self) -> None: + script_path = Path(__file__).parent / "fault-orchestrator" / "pause_faults.sh" + # Compose interpolates `${VAR}` in every string value at parse + # time, which would eat the script's shell variable references + # (`${RANDOM}`, `${MIN_ON}`, `${ANTITHESIS_STOP_FAULTS}`, etc.) + # before bash ever sees them. Double the `$` to pass through a + # literal `$` and let bash do its own expansion at runtime. The + # underlying .sh file stays normal so shellcheck and direct + # execution work. + script = script_path.read_text().replace("$", "$$") + config: ServiceConfig = { + # bash:5 is alpine-based and ships `bash`, `od`, `tr`, and + # `sleep` via busybox — everything the script uses. Public + # image, so it sails through export-compose.py untouched. + "image": "bash:5", + # `bash -c` runs `command` as the script. `bash -s` would read + # from (empty) stdin and treat the script text as `$1` instead. + "entrypoint": ["bash", "-c"], + "command": [script], + "environment": [ + # Defaults chosen so MAX_ON stays well under the smallest + # driver's CATCHUP_TIMEOUT_S (currently 90s) — every + # driver lifetime has a chance to span at least one quiet + # window. + "START_DELAY=30", + "MIN_ON=20", + "MAX_ON=40", + "MIN_OFF=20", + "MAX_OFF=40", + ], + # Wait for materialized so the orchestrator's first + # ANTITHESIS_STOP_FAULTS call doesn't precede the SUT being + # ready.
Timing is not safety-critical: Antithesis only + # starts injecting faults after setup-complete fires from + # the workload container. + "depends_on": { + "materialized": {"condition": "service_healthy"}, + }, + "restart": "no", + } + super().__init__(name="fault-orchestrator", config=config) + + class Workload(Service): """Antithesis workload client — Python test driver.""" @@ -233,6 +306,7 @@ def __init__(self) -> None: "unsafe_enable_unorchestrated_cluster_replicas": "true", }, ), + FaultOrchestrator(), Workload(), ] @@ -253,4 +327,5 @@ def workflow_default(c: Composition) -> None: "mysql-replica", ) c.up("materialized") + c.up("fault-orchestrator") c.up("workload") diff --git a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md index e999c42b76083..af21f23e6665b 100644 --- a/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md +++ b/test/antithesis/scratchbook/properties/kafka-source-no-data-loss.md @@ -43,7 +43,7 @@ Implemented 2026-05-11 (NONE envelope, workload-side) as `test/antithesis/worklo | Message | Type | Fires when | |---------|------|------------| -| `"kafka source caught up to produced offsets after quiet period (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor | +| `"kafka source caught up to produced offsets within catchup budget (none envelope)"` | `sometimes` | Once per invocation after `wait_for_catchup`; the liveness anchor | | `"kafka source: every produced payload is visible exactly once"` | `always` | Per produced payload, after catchup; carries `payload`, `present`, `observed_count` in details | The UPSERT-envelope arm of this property is covered by `upsert-key-reflects-latest-value`.
diff --git a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md index 5e5b6fd239f0e..c84150c26cde9 100644 --- a/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md +++ b/test/antithesis/scratchbook/properties/mysql-myisam-cdc-no-data-loss.md @@ -72,7 +72,7 @@ Implemented as `test/antithesis/workload/test/parallel_driver_mysql_myisam.py`. |---------|------|------------| | `"mysql myisam: CDC source row has correct value after catchup"` | `always` | Per row, after catchup. False ⟺ row missing or value wrong. | | `"mysql myisam: CDC source row count matches inserted count after catchup"` | `always` | Per invocation, after catchup. False ⟺ extra or missing rows for this batch. | -| `"mysql myisam: CDC source caught up to all primary inserts after quiet period"` | `sometimes` | Per invocation. Liveness for the catchup gate. | +| `"mysql myisam: CDC source caught up to all primary inserts within catchup budget"` | `sometimes` | Per invocation. Liveness for the catchup gate. | | `"mysql replica: both cdc_test tables replicated from primary within 90s"` | `sometimes` | Per timeline (fires once from `first_mysql_replica_setup`). Confirms replication is flowing for both engines. | Knobs: `ROWS_PER_INVOCATION=20`, `QUIET_PERIOD_S=25`, `CATCHUP_TIMEOUT_S=90.0`. diff --git a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md index 19f6d02d68974..dd707894f61bf 100644 --- a/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md +++ b/test/antithesis/scratchbook/properties/mysql-source-no-data-loss.md @@ -26,7 +26,7 @@ Each `parallel_driver_` invocation: 3. Requests an Antithesis quiet period (25 s) and polls `antithesis_cdc` in Materialize until all expected rows appear or the 90 s budget expires. 4. 
Fires: - - `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` + - `sometimes("mysql: CDC source caught up to all primary inserts within catchup budget", …)` — liveness anchor; confirms at least one invocation reaches full catchup. - `always("mysql: CDC source row has correct value after catchup", …)` — safety; fired once per row, catches wrong-value corruption. diff --git a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md index 90341358df926..d504357e685ee 100644 --- a/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md +++ b/test/antithesis/scratchbook/properties/upsert-key-reflects-latest-value.md @@ -52,9 +52,9 @@ Implemented 2026-05-11 as `test/antithesis/workload/test/parallel_driver_upsert_ |---------|------|------| | `"upsert: SELECT for key matches latest produced value"` | `always` | Per sampled live key after quiet-period catchup | | `"upsert: tombstoned key has no row in source"` | `always` | Per sampled key whose last produced message was a tombstone | -| `"upsert: source caught up to produced offsets after quiet period"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data | +| `"upsert: source caught up to produced offsets within catchup budget"` | `sometimes` | Once per invocation; liveness anchor proving the safety assertions ran against settled data | -Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_quiet.py` (`ANTITHESIS_STOP_FAULTS` wrapper), `helper_random.py` (deterministic randomness with Antithesis SDK), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`). 
+Shared helpers introduced for this property and reusable by every subsequent Kafka source property: `helper_pg.py` (resilient pgwire), `helper_kafka.py` (producer + delivery tracker), `helper_random.py` (Antithesis SDK randomness, including an `AntithesisRandom` subclass for code expecting a `random.Random`), `helper_source_stats.py` (catchup polling on `mz_internal.mz_source_statistics`), `helper_upsert_source.py` (idempotent `CREATE CONNECTION` + `CREATE SOURCE`). Quiet windows are driven globally by a `fault-orchestrator` service (alternating randomized faults-ON / faults-OFF intervals); drivers no longer call `ANTITHESIS_STOP_FAULTS` themselves and rely on `wait_for_catchup` with a budget sized to span one quiet window. No SUT-side instrumentation added in this pass — that is the candidate work in `properties/upsert-no-internal-panic.md`, `properties/upsert-state-consolidation-wellformed.md`, and `properties/upsert-ensure-decoded-called-before-access.md`. diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index ec139c3ea4ae9..067a63f755e8c 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -294,7 +294,7 @@ Properties specific to the Kafka source ingestion pipeline: `KafkaSourceReader` |---|---| | **Type** | Safety | | **Priority** | P0 — the entire user-visible promise of the UPSERT envelope | -| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets after quiet period"). | +| **Status** | **Implemented** (workload-side) — `test/antithesis/workload/test/parallel_driver_upsert_latest_value.py`. 
Two `always()` assertions ("upsert: SELECT for key matches latest produced value", "upsert: tombstoned key has no row in source") plus one `sometimes()` liveness anchor ("upsert: source caught up to produced offsets within catchup budget"). | | **Property** | At a settled timestamp, for each key produced by the workload, the UPSERT source contains exactly the value from the last `(key, value)` message produced — or no row if the last message for that key was a tombstone. | | **Invariant** | `Always`: for every workload-tracked key, `SELECT value FROM source WHERE key = ?` returns the expected value (or empty for tombstoned keys), as determined by the workload's local model of what it produced. Checked after `ANTITHESIS_STOP_FAULTS` quiet periods. | | **Antithesis Angle** | Reorder produce timing, kill clusterd between the prior-value lookup (`multi_get`) and the new-value write (`multi_put`), inject delays in the feedback-driven snapshot phase. Tests order-key monotonicity (commit f177db8286), state-backend consistency, and snapshot-completion correctness. | @@ -437,7 +437,7 @@ commit-order preservation) to the Antithesis environment. |---|---| | **Type** | Liveness + Safety | | **Priority** | P1 — end-to-end correctness of the MySQL CDC pipeline; tests a distinct code path from Kafka | -| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, waits for a quiet period, then polls `antithesis_cdc` until all rows appear (or 90 s budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts after quiet period", …)` is the liveness anchor. 
The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. | +| **Status** | **Implemented (workload-side)** — `test/antithesis/workload/test/parallel_driver_mysql_cdc.py` + `first_mysql_replica_setup.py`. Each `parallel_driver_` invocation inserts 20 rows to MySQL primary, then polls `antithesis_cdc` until all rows appear (or the 120 s catchup budget expires). `always("mysql: CDC source row has correct value after catchup", …)` and `always("mysql: CDC source row count matches inserted count after catchup", …)` fire per-row and per-batch after confirmed catchup; `sometimes("mysql: CDC source caught up to all primary inserts within catchup budget", …)` is the liveness anchor. The `first_mysql_replica_setup.py` creates the MySQL schema, configures multithreaded replication (4 workers, `replica_preserve_commit_order=ON`), and creates the Materialize connection/source/table, firing `reachable("mysql: first-run setup complete …")` as a coverage anchor. | | **Property** | After inserting a row to the MySQL primary (via the binlog + GTID-based multithreaded replica), the Materialize CDC source eventually contains that row with the correct value. | | **Invariant** | `Always`: after catchup, for every row inserted to `antithesis.cdc_test` on the primary, `SELECT value FROM antithesis_cdc WHERE id = ?` returns the expected value. `Sometimes`: catchup completes within the quiet-period budget at least once per run.
| | **Antithesis Angle** | Kills to the MySQL replica container (replica restarts from persisted GTID position); kills to the MySQL primary (replica and Materialize source must handle upstream silence gracefully); clusterd restarts (MySQL CDC resume exercises the same `storage-command-replay-idempotent` path as Kafka); parallel worker scheduling jitter that stresses the `replica_preserve_commit_order` protocol. | diff --git a/test/antithesis/workload/test/helper_quiet.py b/test/antithesis/workload/test/helper_quiet.py deleted file mode 100644 index adb4f9ead3e6d..0000000000000 --- a/test/antithesis/workload/test/helper_quiet.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright Materialize, Inc. and contributors. All rights reserved. -# -# Use of this software is governed by the Business Source License -# included in the LICENSE file at the root of this repository. -# -# As of the Change Date specified in that file, in accordance with -# the Business Source License, use of this software will be governed -# by the Apache License, Version 2.0. - -"""Wrapper around the Antithesis ANTITHESIS_STOP_FAULTS binary. - -Outside Antithesis (e.g. snouty local validate), the env var is unset and this -becomes a no-op so the workload still runs end-to-end. -""" - -from __future__ import annotations - -import logging -import os -import subprocess - -LOG = logging.getLogger("antithesis.helper_quiet") - - -def request_quiet_period(seconds: int) -> bool: - """Request that Antithesis pause all faults for `seconds`. - - Returns True if the request was issued, False if not in Antithesis. Either - way callers must still poll for the system to stabilize — the binary - returns immediately and the actual quiet window unfolds asynchronously. 
- """ - binary = os.environ.get("ANTITHESIS_STOP_FAULTS") - if not binary: - LOG.info("ANTITHESIS_STOP_FAULTS not set; skipping quiet-period request") - return False - LOG.info("requesting %ds quiet period via %s", seconds, binary) - subprocess.run([binary, str(seconds)], check=False) - return True diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py index bbb4e2529eca8..656a6b6b6d776 100755 --- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -24,8 +24,10 @@ workload can filter the source down to its own rows when asserting. 3. Produces N distinct payloads, recording the broker-assigned `(partition, offset)` for each via the delivery callback. - 4. Requests an Antithesis quiet period and waits for `offset_committed` - to reach the highest produced offset. + 4. Waits for `offset_committed` to reach the highest produced offset. + The global fault-orchestrator service drives quiet/active windows + on its own cadence; the catchup timeout is sized to span at least + one quiet window so the source can advance during it. 5. 
Runs two `assert_always` checks: - "kafka source: no duplicate (partition, offset)" — `GROUP BY 1, 2 HAVING COUNT(*) > 1` is empty - "kafka source: every produced payload is visible exactly once" — @@ -45,7 +47,6 @@ import sys import helper_random -from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_none_source import ( SOURCE_NONE_TEXT, @@ -53,9 +54,10 @@ ensure_none_text_source, ) from helper_pg import query_retry -from helper_quiet import request_quiet_period from helper_source_stats import wait_for_catchup +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -65,8 +67,9 @@ # — Antithesis launches the driver many times and accumulates coverage # across invocations, not within one giant batch. PRODUCES_PER_INVOCATION = 50 -QUIET_PERIOD_S = 20 -CATCHUP_TIMEOUT_S = 60.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus enough buffer for catchup itself. +CATCHUP_TIMEOUT_S = 90.0 def main() -> int: @@ -114,14 +117,13 @@ def main() -> int: # source query below joins payloads back to (partition, offset) # assignments without us needing to track them at produce time. 
- request_quiet_period(QUIET_PERIOD_S) caught_up = wait_for_catchup( SOURCE_NONE_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S ) sometimes( caught_up, - "kafka source caught up to produced offsets after quiet period (none envelope)", + "kafka source caught up to produced offsets within catchup budget (none envelope)", {"source": SOURCE_NONE_TEXT, "target_offset": max_produced}, ) diff --git a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py index 876f5ff5a8e5e..635efe79fac69 100755 --- a/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py +++ b/test/antithesis/workload/test/parallel_driver_mv_reflects_table_updates.py @@ -25,8 +25,10 @@ 2. Picks a per-invocation prefix so concurrent driver instances scope to disjoint MV rows. 3. INSERTs N rows tagged with the prefix. - 4. Requests an Antithesis quiet period and polls the MV until the count - for the prefix equals N. + 4. Polls the MV until the count for the prefix equals N. The global + fault-orchestrator service drives quiet/active windows on its own + cadence; this driver's catchup timeout is sized to span at least + one quiet window so the read can complete during it. 5. Asserts: - `always(...)` the MV count matches what was inserted (no over- or under-counting after settle). 
@@ -45,19 +47,21 @@ import time import helper_random -from antithesis.assertions import always, sometimes from helper_pg import execute_retry, query_one_retry -from helper_quiet import request_quiet_period from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) LOG = logging.getLogger("driver.mv_reflects_table_updates") INSERTS_PER_INVOCATION = 40 -QUIET_PERIOD_S = 20 -CATCHUP_TIMEOUT_S = 60.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus enough buffer for the MV catchup itself +# during that window. +CATCHUP_TIMEOUT_S = 90.0 CATCHUP_POLL_INTERVAL_S = 0.5 @@ -97,8 +101,6 @@ def main() -> int: params, ) - request_quiet_period(QUIET_PERIOD_S) - # Poll the MV until the row_count for this prefix reaches N. The MV's # `COUNT(*) GROUP BY prefix` shape means the row for this prefix may # appear partially populated during the catchup window. @@ -112,7 +114,7 @@ def main() -> int: sometimes( caught_up, - "mv: row_count caught up to inserted count after quiet period", + "mv: row_count caught up to inserted count within catchup budget", { "mv": MV_NAME, "table": TABLE_MV_INPUT, diff --git a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py index 233207ff8e3c6..4537985fe64ca 100644 --- a/test/antithesis/workload/test/parallel_driver_mysql_cdc.py +++ b/test/antithesis/workload/test/parallel_driver_mysql_cdc.py @@ -20,7 +20,7 @@ collide. 3. Inserts ROWS_PER_INVOCATION rows to the MySQL primary, recording the expected {id → value} map locally. - 4. Requests an Antithesis quiet period and polls the Materialize source + 4. Polls the Materialize source table until all expected rows appear (or the budget expires). 5. Asserts correctness via `always(...)` on count and per-row values. 
A `sometimes(...)` liveness anchor fires on successful catchup. @@ -39,10 +39,10 @@ import helper_mysql import helper_random -from antithesis.assertions import always, sometimes from helper_mysql_source import SOURCE_NAME, TABLE_NAME from helper_pg import query_retry -from helper_quiet import request_quiet_period + +from antithesis.assertions import always, sometimes logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" @@ -50,8 +50,10 @@ LOG = logging.getLogger("driver.mysql_cdc") ROWS_PER_INVOCATION = 20 -QUIET_PERIOD_S = 25 -CATCHUP_TIMEOUT_S = 90.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus the time for replica → source → MZ +# catchup itself, which can stretch under intermittent network faults. +CATCHUP_TIMEOUT_S = 120.0 POLL_INTERVAL_S = 1.0 @@ -174,16 +176,14 @@ def main() -> int: LOG.info("no rows inserted successfully this invocation; exiting cleanly") return 0 - LOG.info("inserted %d rows; requesting quiet period", len(expected)) - request_quiet_period(QUIET_PERIOD_S) - + LOG.info("inserted %d rows; waiting for catchup", len(expected)) caught_up = _wait_for_catchup(batch_id, len(expected)) # Liveness anchor: at least one invocation should fully catch up. If this # never fires across an entire run the safety assertions below are vacuous. sometimes( caught_up, - "mysql: CDC source caught up to all primary inserts after quiet period", + "mysql: CDC source caught up to all primary inserts within catchup budget", { "source": TABLE_NAME, "batch_id": batch_id, diff --git a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py index 00542bba536bc..a7075a276c365 100644 --- a/test/antithesis/workload/test/parallel_driver_mysql_myisam.py +++ b/test/antithesis/workload/test/parallel_driver_mysql_myisam.py @@ -40,7 +40,7 @@ 2. 
Picks a per-invocation `batch_id` prefix so concurrent drivers (including the InnoDB sibling) don't collide. 3. Inserts ROWS_PER_INVOCATION rows to the MyISAM table on the primary. - 4. Requests an Antithesis quiet period and polls the Materialize source + 4. Polls the Materialize source table until all expected rows appear (or the budget expires). 5. Asserts correctness via `always(...)` on count and per-row values. """ @@ -53,7 +53,6 @@ import helper_mysql import helper_random -from antithesis.assertions import always, sometimes from helper_mysql_source import ( MYSQL_DATABASE, MYSQL_TABLE_MYISAM, @@ -61,7 +60,8 @@ TABLE_NAME_MYISAM, ) from helper_pg import query_retry -from helper_quiet import request_quiet_period + +from antithesis.assertions import always, sometimes logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" @@ -69,8 +69,9 @@ LOG = logging.getLogger("driver.mysql_myisam") ROWS_PER_INVOCATION = 20 -QUIET_PERIOD_S = 25 -CATCHUP_TIMEOUT_S = 90.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus replica → source → MZ catchup time. 
+CATCHUP_TIMEOUT_S = 120.0 POLL_INTERVAL_S = 1.0 @@ -199,14 +200,12 @@ def main() -> int: LOG.info("no rows inserted successfully this invocation; exiting cleanly") return 0 - LOG.info("inserted %d rows; requesting quiet period", len(expected)) - request_quiet_period(QUIET_PERIOD_S) - + LOG.info("inserted %d rows; waiting for catchup", len(expected)) caught_up = _wait_for_catchup(batch_id, len(expected)) sometimes( caught_up, - "mysql myisam: CDC source caught up to all primary inserts after quiet period", + "mysql myisam: CDC source caught up to all primary inserts within catchup budget", { "source": TABLE_NAME_MYISAM, "batch_id": batch_id, diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 945fd0805e515..3e698eeb1e660 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -47,7 +47,6 @@ import helper_random import psycopg -from antithesis.assertions import always, sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -57,6 +56,7 @@ PGUSER_INTERNAL, ) +from antithesis.assertions import always, sometimes from materialize.data_ingest.query_error import QueryError from materialize.parallel_workload import executor as _pw_executor from materialize.parallel_workload.action import ( @@ -258,9 +258,7 @@ def _matches_setup_tolerance(exc: BaseException) -> bool: signal). 
""" msg = getattr(exc, "msg", None) or str(exc) - return any( - pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS) - ) + return any(pat in msg for pat in (*_SETUP_RACE_PATTERNS, *_SETUP_FAULT_PATTERNS)) def _worker_death_tolerable(occurred: Exception | None) -> bool: @@ -412,7 +410,9 @@ def _create_database_for_antithesis(database: Database, exe: Executor) -> None: "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA " "REGISTRY URL 'http://schema-registry:8081'", ) - _tolerate_setup_race(exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'") + _tolerate_setup_race( + exe.execute, "CREATE SECRET IF NOT EXISTS minio AS 'minioadmin'" + ) _tolerate_setup_race( exe.execute, "CREATE CONNECTION IF NOT EXISTS aws_conn TO AWS (" @@ -596,7 +596,11 @@ def _run_invocation( dead = [t for t in threads if not t.is_alive()] if dead: occurred = next( - (w.occurred_exception for w in workers if w.occurred_exception), + ( + w.occurred_exception + for w in workers + if w.occurred_exception + ), None, ) worker_failed = WorkerFailedException( @@ -667,7 +671,9 @@ def _run_invocation( "parallel workload: worker thread death tolerated as fault-injection consequence", { "error": ( - str(worker_failed.cause) if worker_failed and worker_failed.cause else None + str(worker_failed.cause) + if worker_failed and worker_failed.cause + else None ), "uncaptured": worker_failed is not None and worker_failed.cause is None, }, diff --git a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py index 19e7d1d698dbc..71cb339149018 100755 --- a/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py +++ b/test/antithesis/workload/test/parallel_driver_strict_serializable_reads.py @@ -37,7 +37,9 @@ invariant. - `always(final >= max(count), …)` for the closing observation. 
- `sometimes(...)` liveness anchor confirming the closing - observation reached the inserted count after the quiet period. + observation reached the inserted count within the final-read + budget (which is sized to span at least one quiet window from + the global fault-orchestrator). Read failures (connect timeout, server unavailable mid-fault) are skipped rather than recorded — they are not regression evidence, and a False @@ -57,7 +59,6 @@ import helper_random import psycopg -from antithesis.assertions import always, sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -65,17 +66,20 @@ PGUSER, execute_retry, ) -from helper_quiet import request_quiet_period from helper_table_mv import MV_NAME, TABLE_MV_INPUT, ensure_table_and_mv +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) LOG = logging.getLogger("driver.strict_serializable_reads") STEPS_PER_INVOCATION = 12 -QUIET_PERIOD_S = 15 -FINAL_READ_TIMEOUT_S = 30.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus the time the final read needs after +# the MV catches up. +FINAL_READ_TIMEOUT_S = 90.0 FINAL_READ_POLL_S = 0.5 PROBE_CONNECT_TIMEOUT_S = 5 @@ -150,8 +154,8 @@ def main() -> int: observations.append((step, observed)) # Settle and take the closing observation. The driver is short and the - # observations list is small, so a generous timeout here is fine. - request_quiet_period(QUIET_PERIOD_S) + # observations list is small, so a generous timeout here is fine — long + # enough to span at least one global-orchestrator quiet window. expected_final = len(observations) and observations[-1][0] # `expected_final` is the largest step that was actually INSERTed (we # may have bailed early). 
It's an *upper bound* on the count — the diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py index 296bf115fd425..84a9a47369a50 100644 --- a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py @@ -59,10 +59,8 @@ import sys import helper_random -from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_pg import query_retry -from helper_quiet import request_quiet_period from helper_source_stats import wait_for_catchup from helper_upsert_source import ( SOURCE_UPSERT_TEXT, @@ -70,6 +68,8 @@ ensure_upsert_text_source, ) +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -89,8 +89,10 @@ # many short invocations rather than one big one. ANCIENT_KEYS_PER_INVOCATION = 5 -QUIET_PERIOD_S = 20 -CATCHUP_TIMEOUT_S = 60.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus the time the upsert source needs to +# advance offset_committed past our produces. 
+CATCHUP_TIMEOUT_S = 90.0 def _produce(producer, tracker, topic: str, key: str, value: str) -> None: @@ -188,13 +190,12 @@ def main() -> int: LOG.info("no produces confirmed; exiting cleanly") return 0 - request_quiet_period(QUIET_PERIOD_S) caught_up = wait_for_catchup( SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S ) sometimes( caught_up, - "upsert: source caught up after cross-invocation produces", + "upsert: source caught up after cross-invocation produces within catchup budget", {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced}, ) if not caught_up: diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index 68734c7f03c82..7e3032258dee5 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -10,9 +10,9 @@ """Antithesis driver for property `upsert-key-reflects-latest-value`. -For each key produced to a Kafka UPSERT-envelope source, after a quiet period -that lets Materialize catch up, the source's row for that key must reflect the -last value produced — or be absent if the last message was a tombstone. +For each key produced to a Kafka UPSERT-envelope source, once Materialize +catches up, the source's row for that key must reflect the last value +produced — or be absent if the last message was a tombstone. Each invocation: 1. Ensures the upsert source exists (idempotent CREATE ... IF NOT EXISTS). @@ -20,8 +20,9 @@ interfere with each other's expected-state model. 3. Produces a deterministic mix of upserts and tombstones, tracking the local "what should the source say" model. - 4. Requests an Antithesis quiet period and waits for offset_committed to - reach the highest produced offset. + 4. Waits for offset_committed to reach the highest produced offset. 
The + global fault-orchestrator drives quiet/active windows; this driver + just polls until catchup completes or the budget expires. 5. For every tracked key, asserts that what's in the source matches the local model. Live keys use one assertion message, tombstoned keys use another, so triage can distinguish the two failure modes. @@ -37,10 +38,8 @@ import sys import helper_random -from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_pg import query_one_retry -from helper_quiet import request_quiet_period from helper_source_stats import wait_for_catchup from helper_upsert_source import ( SOURCE_UPSERT_TEXT, @@ -48,6 +47,8 @@ ensure_upsert_text_source, ) +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -67,8 +68,10 @@ # budget on the same workload shape. TOMBSTONE_PROB_RANGE = (0.05, 0.50) -QUIET_PERIOD_S = 20 -CATCHUP_TIMEOUT_S = 60.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) plus the time the upsert source needs to +# advance offset_committed past our produces. +CATCHUP_TIMEOUT_S = 90.0 def _produce(producer, tracker, topic: str, key: str, value: str | None) -> None: @@ -125,9 +128,7 @@ def main() -> int: # range. The fuzzer sees this as one of the first decisions in the # timeline and can drive it toward whichever extreme reveals a bug. tombstone_prob = helper_random.random_float(*TOMBSTONE_PROB_RANGE) - LOG.info( - "driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob - ) + LOG.info("driver starting; prefix=%s tombstone_prob=%.3f", prefix, tombstone_prob) producer, tracker = make_producer(client_id=f"antithesis-{prefix}") @@ -178,8 +179,9 @@ def main() -> int: LOG.info("no messages confirmed delivered this invocation; exiting cleanly") return 0 - # Now ask Antithesis to pause faults and wait for Materialize to catch up. 
- request_quiet_period(QUIET_PERIOD_S) + # Wait for Materialize to catch up. Quiet windows are driven globally by + # the fault-orchestrator service; this catchup timeout is sized to span + # at least one such window so the source can advance during it. caught_up = wait_for_catchup( SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S ) @@ -189,7 +191,7 @@ def main() -> int: # vacuous and the run is uninteresting. sometimes( caught_up, - "upsert: source caught up to produced offsets after quiet period", + "upsert: source caught up to produced offsets within catchup budget", {"source": SOURCE_UPSERT_TEXT, "target_offset": max_produced}, ) diff --git a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py index 5612a19c30ea8..fd9c7cf389001 100755 --- a/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py +++ b/test/antithesis/workload/test/singleton_driver_catalog_recovery_consistency.py @@ -53,7 +53,6 @@ import helper_random import psycopg -from antithesis.assertions import always, sometimes from helper_pg import ( PGDATABASE, PGHOST, @@ -63,6 +62,8 @@ query_retry, ) +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index 3c9876ba79988..3be0de672c626 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -25,8 +25,10 @@ Each cycle: 1. Produce a batch of (key, value) and (key, null) messages, updating the in-memory `expected_state` model. - 2. Request a quiet period and wait for `offset_committed` to reach the - highest produced offset. + 2. 
Wait for `offset_committed` to reach the highest produced offset. + The global fault-orchestrator drives quiet/active windows on its + own cadence; the per-cycle catchup timeout is sized to span at + least one quiet window so settle has somewhere to land. 3. SELECT every tracked key's current source state and assert it matches `expected_state` via `always("upsert: rehydrated state equals local model", ...)`. Across-cycle stability is exactly what @@ -38,15 +40,15 @@ A previous version of this driver also recorded a "clusterd observed non-online" `sometimes` anchor via a once-per-cycle SELECT of -`mz_internal.mz_cluster_replica_statuses`. That assertion was structurally -unable to fire here: each cycle requests a 25-second Antithesis quiet -period before its assertions, the probe runs *after* the quiet period -(when faults are paused and killed containers have been restored), and -the introspection view itself lags clusterd death by the -orchestrator-process 5-second poll. The "did we see a replica go -offline" signal lives in `anytime_fault_recovery_exercised.py` instead, -which polls continuously and never requests a quiet period, so it has -the right shape to observe the offline window. +`mz_internal.mz_cluster_replica_statuses`. That assertion was +structurally unable to fire here: when faults are paused (either by +the old per-driver `ANTITHESIS_STOP_FAULTS` calls or by the new global +fault-orchestrator's quiet window), killed containers are restored +before the probe runs, and the introspection view itself lags clusterd +death by the orchestrator-process 5-second poll. The "did we see a +replica go offline" signal lives in `anytime_fault_recovery_exercised.py` +instead, which polls continuously and is unaffected by quiet windows, +so it has the right shape to observe the offline window. Distinct prefix per timeline keeps multiple parallel timelines independent.
""" @@ -58,10 +60,8 @@ import time import helper_random -from antithesis.assertions import always, sometimes from helper_kafka import make_producer from helper_pg import query_one_retry -from helper_quiet import request_quiet_period from helper_source_stats import wait_for_catchup from helper_upsert_source import ( SOURCE_UPSERT_TEXT, @@ -69,6 +69,8 @@ ensure_upsert_text_source, ) +from antithesis.assertions import always, sometimes + logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s" ) @@ -90,8 +92,10 @@ # rehydration, not just per-cycle convergence. TOMBSTONE_PROB_RANGE = (0.05, 0.50) -QUIET_PERIOD_S = 25 -CATCHUP_TIMEOUT_S = 120.0 +# Sized to span at least one MAX_OFF window from the global fault- +# orchestrator (default 40s) and survive a clusterd restart inside it; +# rehydration after a kill is the whole point of this driver. +CATCHUP_TIMEOUT_S = 180.0 INTER_CYCLE_SLEEP_S = 2.0 @@ -167,7 +171,8 @@ def _run_cycle( LOG.info("cycle %d: no messages confirmed delivered; skipping", cycle_idx) return False - request_quiet_period(QUIET_PERIOD_S) + # The global fault-orchestrator drives quiet windows; this catchup + # timeout is sized to span one and survive a clusterd kill in it. caught_up = wait_for_catchup( SOURCE_UPSERT_TEXT, max_produced, timeout_s=CATCHUP_TIMEOUT_S ) @@ -234,7 +239,7 @@ def main() -> int: # The "did this run actually span a clusterd restart" anchor is # deliberately not in this driver — see the module docstring. The # `cycles_run >= 2` check below is the rehydration-coverage anchor: - # without two post-quiet-period reads, the safety assertions could + # without two settle-then-read cycles, the safety assertions could # be vacuously satisfied by a single early settle. 
sometimes( cycles_run >= 2, From cc65e8eee969ee56a0a594d4ef172e8b5ab9485f Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 17:23:25 -0400 Subject: [PATCH 58/65] test/antithesis: bump connect/retry timeouts to span fault-orchestrator windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The global fault-orchestrator alternates faults-ON/OFF windows of up to MAX_ON / MAX_OFF seconds each (defaults 40s, set in the FaultOrchestrator service). With the previous timeouts a single connect attempt or producer flush could expire entirely inside one 40s faults-ON window — fast-failing before the orchestrator opened the next quiet window and burning retry budget on TCP timeouts. helper_pg: CONNECT_TIMEOUT_S 15 -> 30 (renamed from _CONNECT_TIMEOUT_S so the parallel-workload driver can reuse it instead of hardcoding 15) _RETRY_BUDGET_S 120 -> 180 (spans one full ON+OFF cycle + margin) helper_mysql: same logic — same values. Replication adds primary→replica hops so the budgets match helper_pg's. `wait_for_host`'s 5s probe stays short: it runs at bootstrap before fault injection begins. helper_kafka: new explicit librdkafka producer config — `request.timeout.ms=60000`, `delivery.timeout.ms=180000`. New module- level `ADMIN_TIMEOUT_S=90` for `admin.list_topics` and `create_topics` result waits; new `FLUSH_TIMEOUT_S=90` exported for drivers so `producer.flush(timeout=...)` waits past a single MAX_ON window before declaring pending messages "skipping assertions" material. Per-driver direct psycopg.connect in parallel_driver_parallel_workload (3 sites) now use `CONNECT_TIMEOUT_S` instead of literal 15. The four Kafka-source drivers' `producer.flush(timeout=30)` calls now use `FLUSH_TIMEOUT_S` from helper_kafka. 
Probe timeouts are intentionally kept short — they exist to *measure* unavailability, not wait through it: anytime_fault_recovery_exercised.PROBE_CONNECT_TIMEOUT_S = 2.0 singleton_driver_catalog_recovery_consistency.PROBE_CONNECT_TIMEOUT_S = 2.0 parallel_driver_strict_serializable_reads.PROBE_CONNECT_TIMEOUT_S = 5 Co-Authored-By: Claude Opus 4.7 (1M context) --- test/antithesis/workload/test/helper_kafka.py | 36 +++++++++++++++++-- test/antithesis/workload/test/helper_mysql.py | 12 +++++-- test/antithesis/workload/test/helper_pg.py | 29 +++++++++------ .../parallel_driver_kafka_none_envelope.py | 4 +-- .../test/parallel_driver_parallel_workload.py | 7 ++-- ...llel_driver_upsert_ancient_key_writable.py | 4 +-- .../parallel_driver_upsert_latest_value.py | 4 +-- ...ngleton_driver_upsert_state_rehydration.py | 4 +-- 8 files changed, 74 insertions(+), 26 deletions(-) diff --git a/test/antithesis/workload/test/helper_kafka.py b/test/antithesis/workload/test/helper_kafka.py index a9bf2eac600a1..3486ce79454cd 100644 --- a/test/antithesis/workload/test/helper_kafka.py +++ b/test/antithesis/workload/test/helper_kafka.py @@ -28,6 +28,34 @@ BROKER = os.environ.get("KAFKA_BROKER", "kafka:9092") +# Per-RPC and per-delivery timeouts for librdkafka. Default +# `request.timeout.ms` is 30s, which can fail entirely inside a single +# faults-ON window (MAX_ON defaults to 40s in the global fault- +# orchestrator). Bumping it gives one request a real chance of spanning +# the transition into the next quiet window before failing. librdkafka +# also requires `delivery.timeout.ms` to be >= `request.timeout.ms + +# linger.ms`; we pin both explicitly so the relationship is reviewable +# here rather than implicit. `delivery.timeout.ms` is the wall-clock +# budget the broker side of the producer has to either deliver or fail +# the message; idempotent retries happen under this umbrella, so the +# value needs to span at least one full ON+OFF cycle (~80s) plus +# margin. 
+_REQUEST_TIMEOUT_MS = 60_000 +_DELIVERY_TIMEOUT_MS = 180_000 + +# Wall-clock budget for synchronous admin waits. The orchestrator's +# longest faults-ON window is MAX_ON (40s default); 90s comfortably spans +# one such window plus catchup overhead. +ADMIN_TIMEOUT_S = 90 + +# Wall-clock budget for `producer.flush(timeout=...)` in drivers. Tuned to +# absorb at least one MAX_ON window so a produce burst that landed mid- +# fault still has time to drain after the orchestrator opens its next +# quiet window. Shorter than `_DELIVERY_TIMEOUT_MS` so a flush that +# returns with `pending > 0` is a strong signal the producer is still +# struggling, not that we just ran out of patience. +FLUSH_TIMEOUT_S = 90 + @dataclass class DeliveryTracker: @@ -62,6 +90,10 @@ def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTrack "linger.ms": 5, "enable.idempotence": True, "acks": "all", + # See module-level _REQUEST_TIMEOUT_MS / _DELIVERY_TIMEOUT_MS for + # the fault-orchestrator-aware rationale on these values. + "request.timeout.ms": _REQUEST_TIMEOUT_MS, + "delivery.timeout.ms": _DELIVERY_TIMEOUT_MS, } if client_id: config["client.id"] = client_id @@ -71,7 +103,7 @@ def make_producer(client_id: str | None = None) -> tuple[Producer, DeliveryTrack def ensure_topic(topic: str, num_partitions: int = 1) -> None: """Create the topic if it doesn't already exist.
No-op on race with auto-create.""" admin = AdminClient({"bootstrap.servers": BROKER}) - existing = admin.list_topics(timeout=10).topics + existing = admin.list_topics(timeout=ADMIN_TIMEOUT_S).topics if topic in existing: return LOG.info("creating kafka topic %s with %d partition(s)", topic, num_partitions) @@ -80,7 +112,7 @@ def ensure_topic(topic: str, num_partitions: int = 1) -> None: ) for t, fut in futures.items(): try: - fut.result(timeout=30) + fut.result(timeout=ADMIN_TIMEOUT_S) except KafkaException as exc: # TOPIC_ALREADY_EXISTS = 36 err = exc.args[0] if exc.args else None diff --git a/test/antithesis/workload/test/helper_mysql.py b/test/antithesis/workload/test/helper_mysql.py index f9b79395c556a..35f573186b8f7 100644 --- a/test/antithesis/workload/test/helper_mysql.py +++ b/test/antithesis/workload/test/helper_mysql.py @@ -30,7 +30,15 @@ MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3306")) MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "p@ssw0rd") -_RETRY_BUDGET_S = 120 +# See helper_pg for the rationale on these values. The global fault- +# orchestrator's MAX_ON/MAX_OFF defaults (40s each) mean a per-attempt +# connect_timeout shorter than ~MAX_ON will fast-fail entirely inside a +# faults-ON window, and a retry budget shorter than ~one full ON+OFF cycle +# won't give an attempt a chance to land in the next quiet window. MySQL +# also adds the primary→replica replication path, so the budget is sized +# the same as helper_pg's. 
+_CONNECT_TIMEOUT_S = 30 +_RETRY_BUDGET_S = 180 _RETRY_INITIAL_S = 0.5 _RETRY_MAX_S = 4.0 @@ -51,7 +59,7 @@ def _open(host: str, database: str) -> pymysql.connections.Connection: user="root", password=MYSQL_PASSWORD, database=database, - connect_timeout=15, + connect_timeout=_CONNECT_TIMEOUT_S, autocommit=True, ) except Exception as exc: # noqa: BLE001 diff --git a/test/antithesis/workload/test/helper_pg.py b/test/antithesis/workload/test/helper_pg.py index 59a88f1963ab3..ac6fada801506 100644 --- a/test/antithesis/workload/test/helper_pg.py +++ b/test/antithesis/workload/test/helper_pg.py @@ -38,15 +38,22 @@ # Retry tuning. Antithesis injects partitions and node hangs; conservative bounds # keep drivers progressing without masking real correctness signals. # -# These need to absorb a full Antithesis quiet period plus restart time for the -# system to come back. Quiet-period requests in the workload are typically -# 20-25s; the container then takes a few seconds to become responsive, so the -# overall budget must comfortably exceed ~30s. The per-attempt connect timeout -# also has to be long enough to actually complete a TCP+TLS handshake against -# a hung but recovering materialized — too short and every attempt fails fast -# and the budget is burned without giving the system a chance to answer. -_CONNECT_TIMEOUT_S = 15 -_RETRY_BUDGET_S = 120 +# The global fault-orchestrator alternates faults-ON/OFF windows of up to +# MAX_ON / MAX_OFF seconds each (defaults 40s, defined in +# test/antithesis/mzcompose.py FaultOrchestrator). One full +# fault-ON+fault-OFF cycle is up to MAX_ON+MAX_OFF ~= 80s. +# +# Per-attempt connect_timeout must be long enough that an attempt starting +# late in a faults-ON window has a real chance of completing across the +# transition into the next faults-OFF window. 
A 15s timeout entirely inside +# a 40s faults-ON window fast-fails before the orchestrator opens a quiet +# period, burning retry budget on TCP timeouts rather than waiting for +# materialized to be reachable. +# +# Retry budget must comfortably span at least one full ON+OFF cycle plus +# margin for the system to actually respond once faults pause. +CONNECT_TIMEOUT_S = 30 +_RETRY_BUDGET_S = 180 _RETRY_INITIAL_S = 0.1 _RETRY_MAX_S = 2.0 @@ -72,7 +79,7 @@ def connect(autocommit: bool = True) -> Iterator[psycopg.Connection]: port=PGPORT, user=PGUSER, dbname=PGDATABASE, - connect_timeout=_CONNECT_TIMEOUT_S, + connect_timeout=CONNECT_TIMEOUT_S, autocommit=autocommit, ) break @@ -169,7 +176,7 @@ def execute_internal_retry(sql: str, params: Sequence[Any] | None = None) -> Non port=PGPORT_INTERNAL, user=PGUSER_INTERNAL, dbname=PGDATABASE, - connect_timeout=_CONNECT_TIMEOUT_S, + connect_timeout=CONNECT_TIMEOUT_S, autocommit=True, ) as conn, conn.cursor() as cur, diff --git a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py index 656a6b6b6d776..cfda086d9acc7 100755 --- a/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py +++ b/test/antithesis/workload/test/parallel_driver_kafka_none_envelope.py @@ -47,7 +47,7 @@ import sys import helper_random -from helper_kafka import make_producer +from helper_kafka import FLUSH_TIMEOUT_S, make_producer from helper_none_source import ( SOURCE_NONE_TEXT, TOPIC_NONE_TEXT, @@ -95,7 +95,7 @@ def main() -> int: expected_payloads.add(payload) producer.poll(0) - pending = producer.flush(timeout=30) + pending = producer.flush(timeout=FLUSH_TIMEOUT_S) if pending > 0 or tracker.last_error is not None: # Same fail-closed pattern as the upsert driver: under sustained # fault injection we cannot prove which messages Kafka accepted, so diff --git a/test/antithesis/workload/test/parallel_driver_parallel_workload.py 
b/test/antithesis/workload/test/parallel_driver_parallel_workload.py index 3e698eeb1e660..6a73f39b4be62 100644 --- a/test/antithesis/workload/test/parallel_driver_parallel_workload.py +++ b/test/antithesis/workload/test/parallel_driver_parallel_workload.py @@ -48,6 +48,7 @@ import helper_random import psycopg from helper_pg import ( + CONNECT_TIMEOUT_S, PGDATABASE, PGHOST, PGPORT, @@ -141,7 +142,7 @@ def _prepare_system(num_threads: int) -> None: user=PGUSER_INTERNAL, dbname=PGDATABASE, autocommit=True, - connect_timeout=15, + connect_timeout=CONNECT_TIMEOUT_S, ) as conn, conn.cursor() as cur, ): @@ -334,7 +335,7 @@ def _drop_seed_scoped_objects(seed: str) -> None: user=PGUSER, dbname=PGDATABASE, autocommit=True, - connect_timeout=15, + connect_timeout=CONNECT_TIMEOUT_S, ) as conn, conn.cursor() as cur, ): @@ -579,7 +580,7 @@ def _run_invocation( user=PGUSER, dbname=PGDATABASE, autocommit=True, - connect_timeout=15, + connect_timeout=CONNECT_TIMEOUT_S, ) as setup_conn, setup_conn.cursor() as setup_cur, ): diff --git a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py index 84a9a47369a50..8ea69f67c04b1 100644 --- a/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_ancient_key_writable.py @@ -59,7 +59,7 @@ import sys import helper_random -from helper_kafka import make_producer +from helper_kafka import FLUSH_TIMEOUT_S, make_producer from helper_pg import query_retry from helper_source_stats import wait_for_catchup from helper_upsert_source import ( @@ -173,7 +173,7 @@ def main() -> int: _produce(producer, tracker, TOPIC_UPSERT_TEXT, key, new_value) producer.poll(0) - pending = producer.flush(timeout=30) + pending = producer.flush(timeout=FLUSH_TIMEOUT_S) if pending > 0 or tracker.last_error is not None: # Under sustained fault injection we can't prove which produces # Kafka 
accepted. Bail before asserting — "writes that landed got diff --git a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py index 7e3032258dee5..125e71b7c114f 100755 --- a/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py +++ b/test/antithesis/workload/test/parallel_driver_upsert_latest_value.py @@ -38,7 +38,7 @@ import sys import helper_random -from helper_kafka import make_producer +from helper_kafka import FLUSH_TIMEOUT_S, make_producer from helper_pg import query_one_retry from helper_source_stats import wait_for_catchup from helper_upsert_source import ( @@ -159,7 +159,7 @@ def main() -> int: # Flush all pending deliveries. We poll callbacks while flushing so the # tracker reflects the true max produced offset. - pending = producer.flush(timeout=30) + pending = producer.flush(timeout=FLUSH_TIMEOUT_S) if pending > 0 or tracker.last_error is not None: # Under sustained fault injection we cannot prove which of the just- # produced messages Kafka actually accepted, so `expected` may name diff --git a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py index 3be0de672c626..58e1de5c18ac4 100755 --- a/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py +++ b/test/antithesis/workload/test/singleton_driver_upsert_state_rehydration.py @@ -60,7 +60,7 @@ import time import helper_random -from helper_kafka import make_producer +from helper_kafka import FLUSH_TIMEOUT_S, make_producer from helper_pg import query_one_retry from helper_source_stats import wait_for_catchup from helper_upsert_source import ( @@ -156,7 +156,7 @@ def _run_cycle( expected[key] = value producer.poll(0) - pending = producer.flush(timeout=30) + pending = producer.flush(timeout=FLUSH_TIMEOUT_S) if pending > 0 or tracker.last_error is not None: LOG.info( "cycle %d: skipping 
assertions; flush pending=%d last_error=%s", From bb7c5cb924fedb522dbde7ee7d8b05534306fbd5 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Thu, 14 May 2026 18:42:00 -0400 Subject: [PATCH 59/65] test/antithesis: fault-orchestrator: bash -s -> bash -c so script actually runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FaultOrchestrator service was wired up with `entrypoint: ["bash", "-s"]` and the script body passed as `command`. But `bash -s` reads commands from stdin — and there's no stdin in a detached docker container, so bash exited immediately with no output and the script string was silently assigned to the positional parameter `$1` and never executed. Net effect: the orchestrator container started, exited cleanly, and ANTITHESIS_STOP_FAULTS was never called. Antithesis fault injection ran unconstrained for the entire run, with no quiet windows ever opening. Every driver that needed more than one connection (the four Kafka drivers do CREATE CONNECTION + admin metadata fetch + CREATE SOURCE; the two MySQL drivers do primary writes + MZ reads; the parallel-workload driver does multi-step setup; the strict-serializable driver opens many fresh psycopg connects) effectively starved. Only `parallel_driver_mv_reflects_table_updates.py` ever reached its "driver done" log line: it does one batched INSERT and then polls materialize on a single retried connection, so a brief calm in the faults occasionally let it through. Fix: use `bash -c