From e2052858fd982530d554ec9550eb3294f750b941 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 11:43:15 +0200 Subject: [PATCH 01/13] move AssertNodeLogs to t_helpers, use docker logs streamer; add wf caching soak test and workflow --- .github/workflows/cre-wf-caching-test.yml | 215 ++++++++++++++++++ system-tests/tests/.gitignore | 1 + .../tests/smoke/cre/v2_module_cache_test.go | 33 +-- system-tests/tests/soak/cre/cache_test.go | 147 ++++++++++++ .../tests/test-helpers/container_logs.go | 91 ++++++++ 5 files changed, 455 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/cre-wf-caching-test.yml create mode 100644 system-tests/tests/soak/cre/cache_test.go create mode 100644 system-tests/tests/test-helpers/container_logs.go diff --git a/.github/workflows/cre-wf-caching-test.yml b/.github/workflows/cre-wf-caching-test.yml new file mode 100644 index 00000000000..0de3ad3e237 --- /dev/null +++ b/.github/workflows/cre-wf-caching-test.yml @@ -0,0 +1,215 @@ +name: CRE Workflow Caching Test + +on: + workflow_dispatch: + inputs: + ecr_name: + description: "ECR repository name (default: chainlink)" + required: false + type: string + default: "chainlink" + chainlink_image_tag: + description: "Chainlink image tag to use" + required: true + type: string + workflow_call: + inputs: + ecr_name: + required: false + type: string + default: "chainlink" + chainlink_image_tag: + required: true + type: string + # TODO - remove after testing + pull_request: + +jobs: + caching: + name: CRE Workflow Caching Test + environment: + name: integration + deployment: false + runs-on: runs-on=${{ github.run_id + }}/cpu=32/ram=128/family=m6id+m6idn/spot=false/image=ubuntu24-full-x64/extras=s3-cache + # TODO Adjust to actual observation time + timeout-minutes: 270 # 4h30m — test timeout is 4h20m + permissions: + contents: read + id-token: write + + steps: + - name: Enable S3 Cache for Self-Hosted Runners + uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + with: + metrics: cpu,network,memory,disk + + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ github.sha }} + persist-credentials: false + + - name: Set up Go + id: setup-go + uses: actions/setup-go@v6 + with: + go-version-file: system-tests/tests/go.mod + cache: true + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1 + with: + aws-region: ${{ secrets.QA_AWS_REGION }} + role-to-assume: ${{ secrets.AWS_CTF_READ_ACCESS_ROLE_ARN }} + role-duration-seconds: 3600 + mask-aws-account-id: true + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + with: + registries: ${{ format('{0},{1}', secrets.QA_AWS_ACCOUNT_NUMBER, + secrets.AWS_ACCOUNT_ID_PROD) }} + env: + AWS_REGION: ${{ secrets.QA_AWS_REGION }} + + - name: Start observability stack + shell: bash + working-directory: core/scripts/cre/environment + env: + OBS_MAX_ATTEMPTS: "3" + OBS_RETRY_DELAY_SECONDS: "15" + run: | + set -u + attempt=1 + while [[ "$attempt" -le "$OBS_MAX_ATTEMPTS" ]]; do + echo "::group::Starting observability stack (attempt ${attempt}/${OBS_MAX_ATTEMPTS}, required by caching test)" + if go run . obs up -f; then + echo "::endgroup::" + exit 0 + fi + echo "::endgroup::" + if [[ "$attempt" -lt "$OBS_MAX_ATTEMPTS" ]]; then + go run . obs down || true + sleep "$OBS_RETRY_DELAY_SECONDS" + fi + attempt=$((attempt + 1)) + done + exit 1 + + - name: Start local CRE + uses: ./.github/actions/start-local-cre-environment + with: + jd-image: + "${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION + }}.amazonaws.com/job-distributor:0.22.1" + # TODO - remove nightly image fallback after testing + chainlink-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ + secrets.QA_AWS_REGION }}.amazonaws.com/${{ inputs.ecr_name || + 'chainlink' }}:${{ inputs.chainlink_image_tag != '' && inputs.chainlink_image_tag || 'nightly-20260519-plugins' }}" + ctf-configs: configs/workflow-gateway-don-cache-test.toml + chip-router-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER + }}.dkr.ecr.${{secrets.QA_AWS_REGION + }}.amazonaws.com/local-cre-chip-router:v1.0.1" + retry-count: "3" + retry-delay-seconds: "15" + cleanup-on-error: "false" + working-directory: core/scripts/cre/environment + + - name: Install gotestsum + shell: bash + run: go install gotest.tools/gotestsum@v1.12.3 + + - name: Run CRE Workflow Caching Test + id: run-caching + shell: bash + working-directory: system-tests/tests + env: + CTF_CHIP_INGRESS_IMAGE: + "${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ + secrets.QA_AWS_REGION + }}.amazonaws.com/atlas-chip-ingress:da84cb72d3a160e02896247d46ab4b9\ + 806ebee2f" + CTF_CHIP_CONFIG_IMAGE: "${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ + secrets.QA_AWS_REGION + }}.amazonaws.com/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced018\ + 8f29b969" + CRE_SOAK_CACHE_OBSERVE_WINDOW: "10m" + CRE_SOAK_NUM_WORKFLOWS: "20" + # TODO - adjust test timeout to actual observation time + run: | + gotestsum \ + --jsonfile=/tmp/gotest.log \ + --junitfile=/tmp/junit-report.xml \ + --format=github-actions \ + -- \ + -v -run "^Test_V2_CRE_CacheSoak$" \ + -timeout 4h20m \ + -count=1 \ + github.com/smartcontractkit/chainlink/system-tests/tests/soak/cre + + - name: Upload metrics + if: always() + uses: actions/upload-artifact@v7 + with: + name: metrics + path: system-tests/tests/soak/cre/metrics + retention-days: 7 + + - name: Upload Docker logs + if: failure() + uses: actions/upload-artifact@v7 + with: + name: caching-docker-logs + path: system-tests/tests/soak/cre/logs + retention-days: 7 + + # notify-test-failure: + # name: Notify about test Failure + # #if: failure() + # if: false # TODO: Silence for now + # needs: [soak] + # environment: + # name: integration + # deployment: false + # runs-on: ubuntu-latest + # steps: + # - name: Send slack notification for failed resource regression tests + # id: send-slack-notification + # uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + # with: + # errors: "true" + # method: chat.postMessage + # token: ${{ secrets.QA_SLACK_API_KEY }} + # payload: | + # { + # "channel": "C0117GGJB6Y", + # "text": "CRE Resource Regression Test Failed. Investigation is required.", + # "blocks": [ + # { + # "type": "section", + # "text": { + # "type": "mrkdwn", + # "text": "*:rotating_light: CRE Resource Regression Test Failed. Investigation is required. :rotating_light:*" + # } + # }, + # { + # "type": "section", + # "text": { + # "type": "mrkdwn", + # "text": "Alerting <@U033CGL3CDT>, CRE Resource Regression Test failed for commit <${{ github.server_url }}/${{ github.repository }}/commit/${{ github.sha }}|${{ github.sha }}> on run ID <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ github.run_id }}>. Please investigate." + # } + # }, + # { + # "type": "divider" + # }, + # { + # "type": "section", + # "text": { + # "type": "mrkdwn", + # "text": "Chainlink nodes used more resources than expected. Check the Docker logs and the alloc.pprof file." + # } + # } + # ] + # } diff --git a/system-tests/tests/.gitignore b/system-tests/tests/.gitignore index 34583fa9843..1e97855d496 100644 --- a/system-tests/tests/.gitignore +++ b/system-tests/tests/.gitignore @@ -23,3 +23,4 @@ logs/ *.yaml env_artifact/ +metrics/ diff --git a/system-tests/tests/smoke/cre/v2_module_cache_test.go b/system-tests/tests/smoke/cre/v2_module_cache_test.go index 8484c6ed678..43aed9e5959 100644 --- a/system-tests/tests/smoke/cre/v2_module_cache_test.go +++ b/system-tests/tests/smoke/cre/v2_module_cache_test.go @@ -3,13 +3,9 @@ package cre import ( "context" "fmt" - "os/exec" - "strings" "testing" "time" - "github.com/stretchr/testify/assert" - commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" "github.com/smartcontractkit/chainlink-testing-framework/framework" @@ -68,7 +64,7 @@ func ExecuteModuleCacheTest(t *testing.T, testEnv *ttypes.TestEnvironment) { drainFor(userLogsCh, cacheObserveWindow) - assertNodeLogs(t, testEnv, "Module cache enabled") + t_helpers.AssertNodeLogs(t, testEnv, "Module cache enabled") } func drainFor(ch <-chan *workflowevents.UserLogs, d time.Duration) { @@ -81,30 +77,3 @@ func drainFor(ch <-chan *workflowevents.UserLogs, d time.Duration) { } } } - -func assertNodeLogs(t *testing.T, testEnv *ttypes.TestEnvironment, needle string) { - t.Helper() - - found := false - for _, nodeSet := range testEnv.Config.NodeSets { - if nodeSet.Out == nil { - continue - } - for _, clNode := range nodeSet.Out.CLNodes { - name := clNode.Node.ContainerName - if name == "" { - continue - } - out, err := exec.CommandContext(t.Context(), "docker", "logs", name).CombinedOutput() - if err != nil { - framework.L.Warn().Str("container", name).Err(err).Msg("could not read docker logs") - continue - } - if strings.Contains(string(out), needle) { - found = true - framework.L.Info().Str("container", name).Msg("confirmed: " + needle) - } - } - } - assert.True(t, found, "expected at least one node container log to contain %q", needle) -} diff --git a/system-tests/tests/soak/cre/cache_test.go b/system-tests/tests/soak/cre/cache_test.go new file mode 100644 index 00000000000..92666f02e98 --- /dev/null +++ b/system-tests/tests/soak/cre/cache_test.go @@ -0,0 +1,147 @@ +package cre + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" + workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" + "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/stretchr/testify/require" + + crontypes "github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/cron/types" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" +) + +func Test_V2_CRE_CacheSoak(t *testing.T) { + numWorkflows := 10 + if os.Getenv("CRE_SOAK_NUM_WORKFLOWS") != "" { + var err error + numWorkflows, err = strconv.Atoi(os.Getenv("CRE_SOAK_NUM_WORKFLOWS")) + if err != nil { + t.Fatalf("failed to parse CRE_SOAK_NUM_WORKFLOWS: %v", err) + } + } + + cacheObserveWindow := 2 * time.Minute + if os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW") != "" { + var err error + cacheObserveWindow, err = time.ParseDuration(os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW")) + if err != nil { + t.Fatalf("failed to parse CRE_SOAK_CACHE_OBSERVE_WINDOW: %v", err) + } + } + + testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetTestConfig(t, "/configs/workflow-gateway-don-cache-test.toml")) + testLogger := framework.L + + userLogsCh := make(chan *workflowevents.UserLogs, 1000) + baseMessageCh := make(chan *commonevents.BaseMessage, 1000) + + server := t_helpers.StartChipTestSink(t, t_helpers.GetPublishFn(testLogger, userLogsCh, baseMessageCh)) + t.Cleanup(func() { + // Do not use t.Context() here: it is cancelled before cleanup runs, which breaks chip-router + // unregister and can leave gRPC Publish blocked on full log channels after WatchWorkflowLogs returns. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + t_helpers.ShutdownChipSinkWithDrain(ctx, server, userLogsCh, baseMessageCh) + }) + + workflowFileLocation := "../../../../core/scripts/cre/environment/examples/workflows/cron/main.go" + + workflowConfig := crontypes.WorkflowConfig{ + Schedule: "*/30 * * * * *", + } + + startTime := time.Now() + for i := range numWorkflows { + t_helpers.CompileAndDeployWorkflow(t, testEnv, testLogger, fmt.Sprintf("cachetest%d", i), &workflowConfig, workflowFileLocation) + } + testLogger.Info().Int("count", numWorkflows).Msg("All cache-test workflows deployed") + + t_helpers.WatchWorkflowLogs(t, testLogger, userLogsCh, baseMessageCh, t_helpers.WorkflowEngineInitErrorLog, "Amazing workflow user log", 2*time.Minute) + testLogger.Info().Dur("window", cacheObserveWindow).Msg("First workflow execution confirmed, observing cache activity...") + + // t_helpers.AssertNodeLogs(t, testEnv, "Module cache enabled") + endTime := time.Now() + + // Check Prometheus metrics + pc := framework.NewPrometheusQueryClient(framework.LocalPrometheusBaseURL) + + workflowDONs := testEnv.Dons.DonsWithFlag(cre.WorkflowDON) + require.NotEmpty(t, workflowDONs, "no workflow DONs found") + + type wrappedQueryRangeResponse struct { + NodeName string `json:"node_name"` + framework.QueryRangeResponse + } + + type metric struct { + query string + filename string + } + + metrics := []metric{ + { + query: "increase(platform_workflow_module_cache_eviction_total{node_don=\"%s\", node_index=\"%d\"}[1m])", + filename: "metrics/cache_eviction_increase.json", + }, + { + query: "sum by (source) (increase(platform_workflow_module_cache_reload_total{node_don=\"%s\", node_index=\"%d\"}[1m]))", + filename: "metrics/cache_reload_increase.json", + }, + { + query: "increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%s\", node_index=\"%d\"}[1m])", + filename: "metrics/cache_memory_saved_bytes.json", + }, + } + + for _, metric := range metrics { + results := make([]wrappedQueryRangeResponse, 0) + for _, don := range workflowDONs { + for _, node := range don.Nodes { + query := fmt.Sprintf(metric.query, don.Name, node.Index) + fmt.Println("query:", query) + queryResponse, err := pc.QueryRange(framework.QueryRangeParams{ + Query: query, + Start: startTime, + End: endTime, + Step: 1 * time.Minute, + }) + require.NoError(t, err, "failed to query Prometheus metrics, query:", query) + results = append(results, wrappedQueryRangeResponse{ + NodeName: node.Name, + QueryRangeResponse: *queryResponse, + }) + } + } + + require.NoError(t, saveJSONFile(metric.filename, results), "failed to save JSON file for metric:", metric.filename) + testLogger.Info().Str("filename", metric.filename).Msg("Saved JSON file for metric") + } +} + +func saveJSONFile(path string, v any) error { + if dir := filepath.Dir(path); dir != "" && dir != "." { + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("create directory for %q: %w", path, err) + } + } + + data, err := json.MarshalIndent(v, "", " ") + if err != nil { + return fmt.Errorf("marshal JSON for %q: %w", path, err) + } + if err := os.WriteFile(path, data, 0o644); err != nil { //nolint:gosec // test artifact + return fmt.Errorf("write file %q: %w", path, err) + } + return nil +} diff --git a/system-tests/tests/test-helpers/container_logs.go b/system-tests/tests/test-helpers/container_logs.go new file mode 100644 index 00000000000..776ac771285 --- /dev/null +++ b/system-tests/tests/test-helpers/container_logs.go @@ -0,0 +1,91 @@ +package helpers + +import ( + "encoding/binary" + "fmt" + "io" + "strings" + "testing" + + "github.com/moby/moby/client" + "github.com/smartcontractkit/chainlink-testing-framework/framework" + ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func AssertNodeLogs(t *testing.T, testEnv *ttypes.TestEnvironment, needle string) { + t.Helper() + + targetNames := make(map[string]struct{}) + for _, nodeSet := range testEnv.Config.NodeSets { + if nodeSet.Out == nil { + continue + } + for _, clNode := range nodeSet.Out.CLNodes { + name := clNode.Node.ContainerName + if name != "" { + targetNames[name] = struct{}{} + } + } + } + require.NotEmpty(t, targetNames, "no container names found in test environment") + + logStreams, err := framework.StreamContainerLogs( + client.ContainerListOptions{All: true}, + client.ContainerLogsOptions{ShowStdout: true, ShowStderr: true}, + ) + require.NoError(t, err) + + found := false + for containerName, reader := range logStreams { + if _, ok := targetNames[containerName]; !ok { + _ = reader.Close() + continue + } + content, readErr := readContainerLogs(reader) + if readErr != nil { + framework.L.Warn().Str("container", containerName).Err(readErr).Msg("could not read container logs") + continue + } + if strings.Contains(content, needle) { + found = true + framework.L.Info().Str("container", containerName).Msg("confirmed: " + needle) + } + } + assert.True(t, found, "expected at least one node container log to contain %q", needle) +} + +// readContainerLogs decodes a Docker multiplexed log stream into plain text. +// framework.StreamContainerLogs returns this format; the framework decoder is not exported. +func readContainerLogs(r io.ReadCloser) (string, error) { + defer func() { _ = r.Close() }() + + var buf strings.Builder + if err := decodeDockerLogStream(&buf, r); err != nil { + return "", err + } + return buf.String(), nil +} + +func decodeDockerLogStream(dst io.Writer, r io.Reader) error { + header := make([]byte, 8) + for { + _, err := io.ReadFull(r, header) + if err == io.EOF { + return nil + } + if err != nil { + return fmt.Errorf("read log stream header: %w", err) + } + + msgSize := binary.BigEndian.Uint32(header[4:8]) + msg := make([]byte, msgSize) + if _, err = io.ReadFull(r, msg); err != nil { + return fmt.Errorf("read log message: %w", err) + } + if _, err = dst.Write(msg); err != nil { + return fmt.Errorf("write log message: %w", err) + } + } +} From 1913eab405e1e20a6161d85e005057b26dfdc9cb Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 11:46:08 +0200 Subject: [PATCH 02/13] use newer CTF --- core/scripts/go.mod | 2 +- core/scripts/go.sum | 4 ++-- system-tests/lib/go.mod | 2 +- system-tests/lib/go.sum | 4 ++-- system-tests/tests/go.mod | 4 ++-- system-tests/tests/go.sum | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/scripts/go.mod b/core/scripts/go.mod index 1bb2a6bd47e..52be93b72e3 100644 --- a/core/scripts/go.mod +++ b/core/scripts/go.mod @@ -51,7 +51,7 @@ require ( github.com/smartcontractkit/chainlink-evm/gethwrappers v0.0.0-20260421142741-9c7fbaf7c828 github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260514104516-a827acdffe43 github.com/smartcontractkit/chainlink-protos/job-distributor v0.18.0 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 github.com/smartcontractkit/chainlink-testing-framework/seth v1.51.5 github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/proof-of-reserve/cron-based v0.0.0-00010101000000-000000000000 diff --git a/core/scripts/go.sum b/core/scripts/go.sum index fc61f639f05..40a0a451734 100644 --- a/core/scripts/go.sum +++ b/core/scripts/go.sum @@ -1709,8 +1709,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 h1:wZd5hIQRcQaq3FgW1lg/4ilk68Id6cxYKFNU9iTnugs= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index ae857f8286f..5aceb7cca9d 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -42,7 +42,7 @@ require ( github.com/smartcontractkit/chainlink-protos/job-distributor v0.18.0 github.com/smartcontractkit/chainlink-protos/linking-service/go v0.0.0-20251002192024-d2ad9222409b github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260323124644-faea187e6997 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.15.0 diff --git a/system-tests/lib/go.sum b/system-tests/lib/go.sum index 08628962d00..a1706c2f603 100644 --- a/system-tests/lib/go.sum +++ b/system-tests/lib/go.sum @@ -1678,8 +1678,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 h1:wZd5hIQRcQaq3FgW1lg/4ilk68Id6cxYKFNU9iTnugs= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= diff --git a/system-tests/tests/go.mod b/system-tests/tests/go.mod index fde855bdd96..8905bcd64f8 100644 --- a/system-tests/tests/go.mod +++ b/system-tests/tests/go.mod @@ -51,6 +51,7 @@ require ( github.com/gin-gonic/gin v1.10.1 github.com/google/uuid v1.6.0 github.com/lib/pq v1.11.1 + github.com/moby/moby/client v0.4.1 github.com/pkg/errors v0.9.1 github.com/rs/zerolog v1.34.0 github.com/smartcontractkit/chain-selectors v1.0.100 @@ -62,7 +63,7 @@ require ( github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260514104516-a827acdffe43 github.com/smartcontractkit/chainlink-protos/ring/go v0.0.0-20260331131315-f08a616d8dcd github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260323124644-faea187e6997 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.15.0 github.com/smartcontractkit/chainlink-testing-framework/seth v1.51.5 @@ -127,7 +128,6 @@ require ( github.com/klauspost/crc32 v1.3.0 // indirect github.com/mfridman/interpolate v0.0.2 // indirect github.com/moby/moby/api v1.54.2 // indirect - github.com/moby/moby/client v0.4.1 // indirect github.com/oapi-codegen/runtime v1.1.2 // indirect github.com/pressly/goose/v3 v3.27.0 // indirect github.com/prometheus/common v0.67.5 // indirect diff --git a/system-tests/tests/go.sum b/system-tests/tests/go.sum index 46516f201f6..a0fc5fe8995 100644 --- a/system-tests/tests/go.sum +++ b/system-tests/tests/go.sum @@ -1691,8 +1691,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1 h1:wZd5hIQRcQaq3FgW1lg/4ilk68Id6cxYKFNU9iTnugs= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.1/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= From 721adf1fdc7ada91a8604f0ad139169bd2bed188 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 11:59:21 +0200 Subject: [PATCH 03/13] debug --- system-tests/tests/soak/cre/cache_test.go | 31 ++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/system-tests/tests/soak/cre/cache_test.go b/system-tests/tests/soak/cre/cache_test.go index 92666f02e98..c89ce302c03 100644 --- a/system-tests/tests/soak/cre/cache_test.go +++ b/system-tests/tests/soak/cre/cache_test.go @@ -70,12 +70,30 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { t_helpers.WatchWorkflowLogs(t, testLogger, userLogsCh, baseMessageCh, t_helpers.WorkflowEngineInitErrorLog, "Amazing workflow user log", 2*time.Minute) testLogger.Info().Dur("window", cacheObserveWindow).Msg("First workflow execution confirmed, observing cache activity...") - // t_helpers.AssertNodeLogs(t, testEnv, "Module cache enabled") + t_helpers.AssertNodeLogs(t, testEnv, "Module cache enabled") + + testLogger.Info().Dur("window", cacheObserveWindow).Msg("Observing cache activity...") + observeUntil := time.Now().Add(cacheObserveWindow) + for time.Now().Before(observeUntil) { + time.Sleep(30 * time.Second) + testLogger.Info().Dur("remaining", time.Until(observeUntil).Round(time.Second)).Msg("Cache observe progress") + } + testLogger.Info().Msg("Cache observe window complete") endTime := time.Now() // Check Prometheus metrics pc := framework.NewPrometheusQueryClient(framework.LocalPrometheusBaseURL) + r, err := pc.QueryRange(framework.QueryRangeParams{ + Query: "{__name__=~\"platform_workflow_module_cache.*\"}", + Start: startTime.Add(-2 * time.Hour), + End: endTime.Add(2 * time.Hour), + Step: 1 * time.Minute, + }) + require.NoError(t, err, "failed to query available metrics") + fmt.Printf("available metrics: %+v\n", r) + fmt.Println("--------------------------------") + workflowDONs := testEnv.Dons.DonsWithFlag(cre.WorkflowDON) require.NotEmpty(t, workflowDONs, "no workflow DONs found") @@ -121,6 +139,17 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { NodeName: node.Name, QueryRangeResponse: *queryResponse, }) + fmt.Printf("results: %+v\n", results) + fmt.Println("--------------------------------") + + qr, err := pc.QueryRange(framework.QueryRangeParams{ + Query: query, + Start: startTime.Add(-2 * time.Hour), + End: endTime.Add(2 * time.Hour), + Step: 1 * time.Minute, + }) + require.NoError(t, err, "failed to query Prometheus metrics, query:", query) + fmt.Printf("qr with 2h window: %+v\n", qr) } } From 9c9cf1e2b0cfd02e5f1d0f4fb410b1b4dc2025b9 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 14:33:13 +0200 Subject: [PATCH 04/13] clean up + lints --- .github/workflows/post-docker-build.yml | 9 ++++ .../workflow-gateway-don-cache-test.toml | 2 +- system-tests/tests/soak/cre/cache_test.go | 50 ++++++++----------- .../tests/test-helpers/container_logs.go | 8 +-- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/.github/workflows/post-docker-build.yml b/.github/workflows/post-docker-build.yml index ebac65c4922..f62573a4a7b 100644 --- a/.github/workflows/post-docker-build.yml +++ b/.github/workflows/post-docker-build.yml @@ -62,3 +62,12 @@ jobs: with: chainlink_image_tag: ${{ inputs.chainlink_core_image_tag }} secrets: inherit + + call-cre-workflow-caching-tests: + permissions: + contents: read + id-token: write + uses: ./.github/workflows/cre-wf-caching-test.yml + with: + chainlink_image_tag: ${{ inputs.chainlink_core_image_tag }} + secrets: inherit diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml index fd39d8e1267..cedaec117c9 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml @@ -53,7 +53,7 @@ Enabled = true MaxLoaded = 1 IdleEviction = true -IdleTimeout = '30s' +IdleTimeout = '45s' """ [[nodesets]] diff --git a/system-tests/tests/soak/cre/cache_test.go b/system-tests/tests/soak/cre/cache_test.go index c89ce302c03..b04314c6858 100644 --- a/system-tests/tests/soak/cre/cache_test.go +++ b/system-tests/tests/soak/cre/cache_test.go @@ -4,16 +4,18 @@ import ( "context" "encoding/json" "fmt" + "math" "os" "path/filepath" "strconv" "testing" "time" + "github.com/stretchr/testify/require" + commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" "github.com/smartcontractkit/chainlink-testing-framework/framework" - "github.com/stretchr/testify/require" crontypes "github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/cron/types" @@ -22,7 +24,7 @@ import ( ) func Test_V2_CRE_CacheSoak(t *testing.T) { - numWorkflows := 10 + numWorkflows := 20 if os.Getenv("CRE_SOAK_NUM_WORKFLOWS") != "" { var err error numWorkflows, err = strconv.Atoi(os.Getenv("CRE_SOAK_NUM_WORKFLOWS")) @@ -57,12 +59,18 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { workflowFileLocation := "../../../../core/scripts/cre/environment/examples/workflows/cron/main.go" - workflowConfig := crontypes.WorkflowConfig{ - Schedule: "*/30 * * * * *", - } - startTime := time.Now() for i := range numWorkflows { + var workflowConfig crontypes.WorkflowConfig + if math.Mod(float64(i), 2) == 0 { + workflowConfig = crontypes.WorkflowConfig{ + Schedule: "*/30 * * * * *", + } + } else { + workflowConfig = crontypes.WorkflowConfig{ + Schedule: "*/52 * * * * *", + } + } t_helpers.CompileAndDeployWorkflow(t, testEnv, testLogger, fmt.Sprintf("cachetest%d", i), &workflowConfig, workflowFileLocation) } testLogger.Info().Int("count", numWorkflows).Msg("All cache-test workflows deployed") @@ -76,7 +84,7 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { observeUntil := time.Now().Add(cacheObserveWindow) for time.Now().Before(observeUntil) { time.Sleep(30 * time.Second) - testLogger.Info().Dur("remaining", time.Until(observeUntil).Round(time.Second)).Msg("Cache observe progress") + testLogger.Info().Dur("remaining_sec", time.Duration(time.Until(observeUntil).Round(time.Second).Seconds())).Msg("Cache observe progress") } testLogger.Info().Msg("Cache observe window complete") endTime := time.Now() @@ -84,39 +92,34 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { // Check Prometheus metrics pc := framework.NewPrometheusQueryClient(framework.LocalPrometheusBaseURL) - r, err := pc.QueryRange(framework.QueryRangeParams{ - Query: "{__name__=~\"platform_workflow_module_cache.*\"}", - Start: startTime.Add(-2 * time.Hour), - End: endTime.Add(2 * time.Hour), - Step: 1 * time.Minute, - }) - require.NoError(t, err, "failed to query available metrics") - fmt.Printf("available metrics: %+v\n", r) - fmt.Println("--------------------------------") - workflowDONs := testEnv.Dons.DonsWithFlag(cre.WorkflowDON) require.NotEmpty(t, workflowDONs, "no workflow DONs found") type wrappedQueryRangeResponse struct { NodeName string `json:"node_name"` + Metric string `json:"metric"` framework.QueryRangeResponse } type metric struct { query string filename string + metric string } metrics := []metric{ { + metric: "platform_workflow_module_cache_eviction_total", query: "increase(platform_workflow_module_cache_eviction_total{node_don=\"%s\", node_index=\"%d\"}[1m])", filename: "metrics/cache_eviction_increase.json", }, { + metric: "platform_workflow_module_cache_reload_total", query: "sum by (source) (increase(platform_workflow_module_cache_reload_total{node_don=\"%s\", node_index=\"%d\"}[1m]))", filename: "metrics/cache_reload_increase.json", }, { + metric: "platform_workflow_module_cache_memory_saved_bytes", query: "increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%s\", node_index=\"%d\"}[1m])", filename: "metrics/cache_memory_saved_bytes.json", }, @@ -127,7 +130,6 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { for _, don := range workflowDONs { for _, node := range don.Nodes { query := fmt.Sprintf(metric.query, don.Name, node.Index) - fmt.Println("query:", query) queryResponse, err := pc.QueryRange(framework.QueryRangeParams{ Query: query, Start: startTime, @@ -138,18 +140,8 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { results = append(results, wrappedQueryRangeResponse{ NodeName: node.Name, QueryRangeResponse: *queryResponse, + Metric: metric.metric, }) - fmt.Printf("results: %+v\n", results) - fmt.Println("--------------------------------") - - qr, err := pc.QueryRange(framework.QueryRangeParams{ - Query: query, - Start: startTime.Add(-2 * time.Hour), - End: endTime.Add(2 * time.Hour), - Step: 1 * time.Minute, - }) - require.NoError(t, err, "failed to query Prometheus metrics, query:", query) - fmt.Printf("qr with 2h window: %+v\n", qr) } } diff --git a/system-tests/tests/test-helpers/container_logs.go b/system-tests/tests/test-helpers/container_logs.go index 776ac771285..1707b9f3f7c 100644 --- a/system-tests/tests/test-helpers/container_logs.go +++ b/system-tests/tests/test-helpers/container_logs.go @@ -2,16 +2,18 @@ package helpers import ( "encoding/binary" + "errors" "fmt" "io" "strings" "testing" "github.com/moby/moby/client" - "github.com/smartcontractkit/chainlink-testing-framework/framework" - ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" ) func AssertNodeLogs(t *testing.T, testEnv *ttypes.TestEnvironment, needle string) { @@ -72,7 +74,7 @@ func decodeDockerLogStream(dst io.Writer, r io.Reader) error { header := make([]byte, 8) for { _, err := io.ReadFull(r, header) - if err == io.EOF { + if errors.Is(err, io.EOF) { return nil } if err != nil { From 13d552821a277ad5493f184cd928d06b84ed6f55 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 15:28:32 +0200 Subject: [PATCH 05/13] separate topology for caching test, save also cpu/mem metrics --- .github/workflows/cre-wf-caching-test.yml | 4 +- .../workflow-gateway-don-cache-soak-test.toml | 80 +++++++++++++++++++ .../workflow-gateway-don-cache-test.toml | 2 +- ...k_test.go => resource_consumption_test.go} | 0 ...cache_test.go => workflow_caching_test.go} | 24 +++++- 5 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml rename system-tests/tests/soak/cre/{soak_test.go => resource_consumption_test.go} (100%) rename system-tests/tests/soak/cre/{cache_test.go => workflow_caching_test.go} (85%) diff --git a/.github/workflows/cre-wf-caching-test.yml b/.github/workflows/cre-wf-caching-test.yml index 0de3ad3e237..319926162c2 100644 --- a/.github/workflows/cre-wf-caching-test.yml +++ b/.github/workflows/cre-wf-caching-test.yml @@ -42,7 +42,7 @@ jobs: - name: Enable S3 Cache for Self-Hosted Runners uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 with: - metrics: cpu,network,memory,disk + metrics: cpu,network,memory,disk,io - name: Checkout uses: actions/checkout@v6 @@ -108,7 +108,7 @@ jobs: chainlink-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/${{ inputs.ecr_name || 'chainlink' }}:${{ inputs.chainlink_image_tag != '' && inputs.chainlink_image_tag || 'nightly-20260519-plugins' }}" - ctf-configs: configs/workflow-gateway-don-cache-test.toml + ctf-configs: configs/workflow-gateway-don-cache-soak-test.toml chip-router-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{secrets.QA_AWS_REGION }}.amazonaws.com/local-cre-chip-router:v1.0.1" diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml new file mode 100644 index 00000000000..90a85868394 --- /dev/null +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml @@ -0,0 +1,80 @@ +[chip_router] + image = "local-cre-chip-router:v1.0.1" + +[[blockchains]] + type = "anvil" + chain_id = "1337" + container_name = "anvil-1337" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + +[[blockchains]] + type = "anvil" + chain_id = "2337" + container_name = "anvil-2337" + port = "8546" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + +[jd] + csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" + image = "job-distributor:0.22.1" + +[fake] + port = 8171 + +[fake_http] + port = 8666 + +[infra] + type = "docker" + +[[nodesets]] + nodes = 4 + name = "workflow" + don_types = ["workflow"] + override_mode = "all" + http_port_range_start = 10100 + + env_vars = { CL_EVM_CMD = "" } + capabilities = ["vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "evm-1337", "evm-2337"] + registry_based_launch_allowlist = ["cron-trigger@1.0.0"] + + [nodesets.db] + image = "postgres:12.0" + port = 13000 + +[[nodesets.node_specs]] + roles = ["plugin"] + [nodesets.node_specs.node] + docker_ctx = "../../../.." + docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + user_config_overrides = """ +[Capabilities.WorkflowRegistry.ModuleCache] +Enabled = true +MaxLoaded = 20 +IdleEviction = true +IdleTimeout = '45s' +""" + +[[nodesets]] + nodes = 1 + name = "bootstrap-gateway" + don_types = ["bootstrap", "gateway"] + override_mode = "each" + http_port_range_start = 10300 + + env_vars = { CL_EVM_CMD = "" } + supported_evm_chains = [1337, 2337] + + [nodesets.db] + image = "postgres:12.0" + port = 13200 + + [[nodesets.node_specs]] + roles = ["bootstrap", "gateway"] + [nodesets.node_specs.node] + docker_ctx = "../../../.." + docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + custom_ports = ["5002:5002","15002:15002"] + user_config_overrides = "" diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml index cedaec117c9..fd39d8e1267 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-test.toml @@ -53,7 +53,7 @@ Enabled = true MaxLoaded = 1 IdleEviction = true -IdleTimeout = '45s' +IdleTimeout = '30s' """ [[nodesets]] diff --git a/system-tests/tests/soak/cre/soak_test.go b/system-tests/tests/soak/cre/resource_consumption_test.go similarity index 100% rename from system-tests/tests/soak/cre/soak_test.go rename to system-tests/tests/soak/cre/resource_consumption_test.go diff --git a/system-tests/tests/soak/cre/cache_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go similarity index 85% rename from system-tests/tests/soak/cre/cache_test.go rename to system-tests/tests/soak/cre/workflow_caching_test.go index b04314c6858..b9af969067e 100644 --- a/system-tests/tests/soak/cre/cache_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -105,6 +105,7 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { query string filename string metric string + step time.Duration // interval between query points } metrics := []metric{ @@ -112,16 +113,37 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { metric: "platform_workflow_module_cache_eviction_total", query: "increase(platform_workflow_module_cache_eviction_total{node_don=\"%s\", node_index=\"%d\"}[1m])", filename: "metrics/cache_eviction_increase.json", + step: 1 * time.Minute, }, { metric: "platform_workflow_module_cache_reload_total", query: "sum by (source) (increase(platform_workflow_module_cache_reload_total{node_don=\"%s\", node_index=\"%d\"}[1m]))", filename: "metrics/cache_reload_increase.json", + step: 1 * time.Minute, }, { metric: "platform_workflow_module_cache_memory_saved_bytes", query: "increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%s\", node_index=\"%d\"}[1m])", filename: "metrics/cache_memory_saved_bytes.json", + step: 1 * time.Minute, + }, + // average memory usage of the container over the last 10 minutes, unit:MBs + // queried every 5 minutes + // name is the Docker container name, this metric is gathered by cAdvisor + { + metric: "container_memory_rss", + query: "avg_over_time(container_memory_rss{name=\"%s-node%d\"}[10m]) / 1024 / 1024", + filename: "metrics/container_memory_rss.json", + step: 5 * time.Minute, + }, + // average CPU usage of the container over the last 10 minutes, unit:% + // queried every 5 minutes + // name is the Docker container name, this metric is gathered by cAdvisor + { + metric: "container_cpu_usage_seconds_total", + query: "sum(rate(container_cpu_usage_seconds_total{name=\"%s-node%d\"}[10m])) * 100", + filename: "metrics/container_cpu_usage_seconds_total.json", + step: 5 * time.Minute, }, } @@ -134,7 +156,7 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { Query: query, Start: startTime, End: endTime, - Step: 1 * time.Minute, + Step: metric.step, }) require.NoError(t, err, "failed to query Prometheus metrics, query:", query) results = append(results, wrappedQueryRangeResponse{ From a6d87f34e91407fb22c2ae63e91ca79dc76b4d88 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 19 May 2026 15:53:25 +0200 Subject: [PATCH 06/13] bump CTF --- core/scripts/go.mod | 2 +- core/scripts/go.sum | 4 ++-- system-tests/lib/go.mod | 2 +- system-tests/lib/go.sum | 4 ++-- system-tests/tests/go.mod | 2 +- system-tests/tests/go.sum | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/scripts/go.mod b/core/scripts/go.mod index 52be93b72e3..6344705ad20 100644 --- a/core/scripts/go.mod +++ b/core/scripts/go.mod @@ -51,7 +51,7 @@ require ( github.com/smartcontractkit/chainlink-evm/gethwrappers v0.0.0-20260421142741-9c7fbaf7c828 github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260514104516-a827acdffe43 github.com/smartcontractkit/chainlink-protos/job-distributor v0.18.0 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 github.com/smartcontractkit/chainlink-testing-framework/seth v1.51.5 github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/proof-of-reserve/cron-based v0.0.0-00010101000000-000000000000 diff --git a/core/scripts/go.sum b/core/scripts/go.sum index 40a0a451734..03e435a3b38 100644 --- a/core/scripts/go.sum +++ b/core/scripts/go.sum @@ -1709,8 +1709,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 h1:p5zaHFFP1H05cpTl9evI4YBjSmfmljIxYbsZwmLiJxI= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index 5aceb7cca9d..72b4fea9220 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -42,7 +42,7 @@ require ( github.com/smartcontractkit/chainlink-protos/job-distributor v0.18.0 github.com/smartcontractkit/chainlink-protos/linking-service/go v0.0.0-20251002192024-d2ad9222409b github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260323124644-faea187e6997 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.15.0 diff --git a/system-tests/lib/go.sum b/system-tests/lib/go.sum index a1706c2f603..99098c7e2e2 100644 --- a/system-tests/lib/go.sum +++ b/system-tests/lib/go.sum @@ -1678,8 +1678,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 h1:p5zaHFFP1H05cpTl9evI4YBjSmfmljIxYbsZwmLiJxI= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= diff --git a/system-tests/tests/go.mod b/system-tests/tests/go.mod index 8905bcd64f8..7a1abb7e89e 100644 --- a/system-tests/tests/go.mod +++ b/system-tests/tests/go.mod @@ -63,7 +63,7 @@ require ( github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260514104516-a827acdffe43 github.com/smartcontractkit/chainlink-protos/ring/go v0.0.0-20260331131315-f08a616d8dcd github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260323124644-faea187e6997 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.15.0 github.com/smartcontractkit/chainlink-testing-framework/seth v1.51.5 diff --git a/system-tests/tests/go.sum b/system-tests/tests/go.sum index a0fc5fe8995..639396a5c97 100644 --- a/system-tests/tests/go.sum +++ b/system-tests/tests/go.sum @@ -1691,8 +1691,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260429183453-39df0198aed8/go.mod h1:k1HSbHyPaQWPOj6lXDIAe04EuwbC5ge1nK+cpG2E8hE= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556 h1:6ocsoNPu3T0LsBiZ1tGZrjhKu8pGC1opUFz5KgHALSU= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260427132612-76b9f754a556/go.mod h1:LkUo0a46JWaCsLY4SCV5ZOESudehe2RR62C1S46iOqw= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5 h1:Ef1lQHUH0UZNYSo2bUWJLL/Bjl6sBTdPZH+B6zeQvTQ= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2-0.20260519094446-27aa1c0d9ed5/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2 h1:p5zaHFFP1H05cpTl9evI4YBjSmfmljIxYbsZwmLiJxI= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.16.2/go.mod h1:wxgGfrJpzIdC1wyMJEGOfN4H4yPQTZD/DdrMRBxA0io= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4 h1:bGicxBPndwy9NeB79n+CgyNxA8aeWoMudC84krz6QGM= github.com/smartcontractkit/chainlink-testing-framework/framework/components/chiprouter v1.0.4/go.mod h1:TsZMdVIPeIBzFwVIUmU7jkXOTHSpyvCJGeLtjuBxa8E= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.23 h1:FYZZ2U6h2y4sITrEyTKPHTzjJrrsqCqN3zGqkpk7p3s= From 4657bc83bb14e36dedfe7fd6f7448433a77ef7f4 Mon Sep 17 00:00:00 2001 From: mchain0 Date: Tue, 19 May 2026 17:17:01 +0200 Subject: [PATCH 07/13] cre-4396: soak test scenario adjustments; extra metrics; config and limits adjustments --- .github/workflows/cre-wf-caching-test.yml | 3 +- .../workflow-gateway-don-cache-soak-test.toml | 10 +- .../tests/soak/cre/workflow_caching_test.go | 146 ++++++++++++++---- 3 files changed, 122 insertions(+), 37 deletions(-) diff --git a/.github/workflows/cre-wf-caching-test.yml b/.github/workflows/cre-wf-caching-test.yml index 319926162c2..c4b78294128 100644 --- a/.github/workflows/cre-wf-caching-test.yml +++ b/.github/workflows/cre-wf-caching-test.yml @@ -135,8 +135,7 @@ jobs: secrets.QA_AWS_REGION }}.amazonaws.com/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced018\ 8f29b969" - CRE_SOAK_CACHE_OBSERVE_WINDOW: "10m" - CRE_SOAK_NUM_WORKFLOWS: "20" + CRE_SOAK_DURATION: "4h" # TODO - adjust test timeout to actual observation time run: | gotestsum \ diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml index 90a85868394..1285e633be1 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml @@ -51,9 +51,15 @@ user_config_overrides = """ [Capabilities.WorkflowRegistry.ModuleCache] Enabled = true -MaxLoaded = 20 +# Sized for ~50% of 128GiB host at cap; per loaded slot ~= WASMBinarySizeLimit + WASMMemoryLimit (100MiB each by default). +MaxLoaded = 80 IdleEviction = true -IdleTimeout = '45s' +# Long enough that 3m fast cron stays hot and 8m slow cron idles out between runs. +IdleTimeout = '5m' + +[Workflows.Limits] +Global = 5000 +PerOwner = 5000 """ [[nodesets]] diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index b9af969067e..d191d02ab05 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -4,7 +4,6 @@ import ( "context" "encoding/json" "fmt" - "math" "os" "path/filepath" "strconv" @@ -13,6 +12,8 @@ import ( "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-common/pkg/config" + "github.com/smartcontractkit/chainlink-common/pkg/settings/cresettings" commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" "github.com/smartcontractkit/chainlink-testing-framework/framework" @@ -23,8 +24,42 @@ import ( t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" ) +// Memory sizing targets ~50% of a 128GiB soak host for concurrently loaded WASM modules across the +// 4-node workflow DON (see workflow-gateway-don-cache-soak-test.toml nodes + MaxLoaded). +// moduleCacheMaxLoaded is per node; _defaultSoakNumWorkflows exceeds cap slots (~20%) +// so enforceCap stays active after workflows are being triggered across nodes. +const ( + capPressurePercent = 120 + moduleCacheMaxLoaded = 80 // mirrors workflow-gateway-don-cache-soak-test.toml MaxLoaded + + moduleCacheIdleTimeout = 5 * time.Minute + fastCronInterval = 3 * time.Minute + slowCronInterval = 8 * time.Minute + + defaultSoakDuration = 4 * time.Hour + defaultMetricStep = 1 * time.Minute + cachePrometheusRange = "5m" // increase() window; align with defaultMetricStep + soakProgressLogInterval = 5 * time.Minute + deployProgressInterval = 25 + + // One of every cacheSoakSchedulePeriod workflows uses slowCronInterval (~1/3 idle-eviction tier). + cacheSoakSchedulePeriod = 3 +) + +var ( + _workflowModuleMiB = crePerWorkflowSizeLimitMiB( + cresettings.Default.PerWorkflow.WASMBinarySizeLimit.DefaultValue, + ) + _workflowEngineOverheadMiB = crePerWorkflowSizeLimitMiB( + cresettings.Default.PerWorkflow.WASMMemoryLimit.DefaultValue, + ) + _defaultSoakNumWorkflows = moduleCacheMaxLoaded * capPressurePercent / 100 +) + +// Cron timing (below) keeps cap vs idle eviction visible in 5m Prometheus buckets; schedules are staggered. + func Test_V2_CRE_CacheSoak(t *testing.T) { - numWorkflows := 20 + numWorkflows := _defaultSoakNumWorkflows if os.Getenv("CRE_SOAK_NUM_WORKFLOWS") != "" { var err error numWorkflows, err = strconv.Atoi(os.Getenv("CRE_SOAK_NUM_WORKFLOWS")) @@ -33,16 +68,13 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { } } - cacheObserveWindow := 2 * time.Minute - if os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW") != "" { - var err error - cacheObserveWindow, err = time.ParseDuration(os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW")) - if err != nil { - t.Fatalf("failed to parse CRE_SOAK_CACHE_OBSERVE_WINDOW: %v", err) - } + soakDuration := parseDuration(os.Getenv("CRE_SOAK_DURATION"), defaultSoakDuration) + // CRE_SOAK_CACHE_OBSERVE_WINDOW is a legacy alias for CRE_SOAK_DURATION. + if v := os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW"); v != "" { + soakDuration = parseDuration(v, soakDuration) } - testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetTestConfig(t, "/configs/workflow-gateway-don-cache-test.toml")) + testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetTestConfig(t, "/configs/workflow-gateway-don-cache-soak-test.toml")) testLogger := framework.L userLogsCh := make(chan *workflowevents.UserLogs, 1000) @@ -60,33 +92,40 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { workflowFileLocation := "../../../../core/scripts/cre/environment/examples/workflows/cron/main.go" startTime := time.Now() + testLogger.Info(). + Int("max_loaded_per_node", moduleCacheMaxLoaded). + Int("target_workflows", numWorkflows). + Int("target_loaded_mib", moduleCacheMaxLoaded*(_workflowModuleMiB+_workflowEngineOverheadMiB)). + Msg("Deploying cache soak workflows") for i := range numWorkflows { - var workflowConfig crontypes.WorkflowConfig - if math.Mod(float64(i), 2) == 0 { - workflowConfig = crontypes.WorkflowConfig{ - Schedule: "*/30 * * * * *", - } - } else { - workflowConfig = crontypes.WorkflowConfig{ - Schedule: "*/52 * * * * *", - } + workflowConfig := crontypes.WorkflowConfig{ + Schedule: cacheSoakWorkflowSchedule(i), } t_helpers.CompileAndDeployWorkflow(t, testEnv, testLogger, fmt.Sprintf("cachetest%d", i), &workflowConfig, workflowFileLocation) + if (i+1)%deployProgressInterval == 0 || i+1 == numWorkflows { + testLogger.Info().Int("deployed", i+1).Int("total", numWorkflows).Msg("Cache soak deploy progress") + } } testLogger.Info().Int("count", numWorkflows).Msg("All cache-test workflows deployed") t_helpers.WatchWorkflowLogs(t, testLogger, userLogsCh, baseMessageCh, t_helpers.WorkflowEngineInitErrorLog, "Amazing workflow user log", 2*time.Minute) - testLogger.Info().Dur("window", cacheObserveWindow).Msg("First workflow execution confirmed, observing cache activity...") + testLogger.Info().Dur("duration", soakDuration).Msg("First workflow execution confirmed, running cache soak...") t_helpers.AssertNodeLogs(t, testEnv, "Module cache enabled") - testLogger.Info().Dur("window", cacheObserveWindow).Msg("Observing cache activity...") - observeUntil := time.Now().Add(cacheObserveWindow) + testLogger.Info(). + Dur("duration", soakDuration). + Dur("fast_interval", fastCronInterval). + Dur("slow_interval", slowCronInterval). + Dur("idle_timeout", moduleCacheIdleTimeout). + Int("workflows", numWorkflows). + Msg("Observing cache activity") + observeUntil := time.Now().Add(soakDuration) for time.Now().Before(observeUntil) { - time.Sleep(30 * time.Second) - testLogger.Info().Dur("remaining_sec", time.Duration(time.Until(observeUntil).Round(time.Second).Seconds())).Msg("Cache observe progress") + time.Sleep(soakProgressLogInterval) + testLogger.Info().Dur("remaining", time.Until(observeUntil).Round(time.Second)).Msg("Cache soak progress") } - testLogger.Info().Msg("Cache observe window complete") + testLogger.Info().Msg("Cache soak complete") endTime := time.Now() // Check Prometheus metrics @@ -109,23 +148,47 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { } metrics := []metric{ + { + metric: "platform_workflow_module_cache_reload_total", + query: fmt.Sprintf("sum by (source) (increase(platform_workflow_module_cache_reload_total{node_don=\"%%s\", node_index=\"%%d\"}[%s]))", cachePrometheusRange), + filename: "metrics/cache_reload_increase.json", + step: defaultMetricStep, + }, { metric: "platform_workflow_module_cache_eviction_total", - query: "increase(platform_workflow_module_cache_eviction_total{node_don=\"%s\", node_index=\"%d\"}[1m])", + query: fmt.Sprintf("increase(platform_workflow_module_cache_eviction_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), filename: "metrics/cache_eviction_increase.json", - step: 1 * time.Minute, + step: defaultMetricStep, }, { - metric: "platform_workflow_module_cache_reload_total", - query: "sum by (source) (increase(platform_workflow_module_cache_reload_total{node_don=\"%s\", node_index=\"%d\"}[1m]))", - filename: "metrics/cache_reload_increase.json", - step: 1 * time.Minute, + metric: "platform_workflow_module_cache_loaded", + query: fmt.Sprintf("increase(platform_workflow_module_cache_loaded{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_loaded.json", + step: defaultMetricStep, }, { metric: "platform_workflow_module_cache_memory_saved_bytes", - query: "increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%s\", node_index=\"%d\"}[1m])", + query: fmt.Sprintf("increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), filename: "metrics/cache_memory_saved_bytes.json", - step: 1 * time.Minute, + step: defaultMetricStep, + }, + { + metric: "platform_workflow_module_cache_version_mismatch_total", + query: fmt.Sprintf("increase(platform_workflow_module_cache_version_mismatch_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_version_mismatch.json", + step: defaultMetricStep, + }, + { + metric: "platform_workflow_module_cache_pin_exhausted_total", + query: fmt.Sprintf("increase(platform_workflow_module_cache_pin_exhausted_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_pin_exhausted.json", + step: defaultMetricStep, + }, + { + metric: "platform_workflow_module_cache_try_acquire_exhausted_total", + query: fmt.Sprintf("increase(platform_workflow_module_cache_try_acquire_exhausted_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_try_acquire_exhausted.json", + step: defaultMetricStep, }, // average memory usage of the container over the last 10 minutes, unit:MBs // queried every 5 minutes @@ -172,6 +235,23 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { } } +func crePerWorkflowSizeLimitMiB(size config.Size) int { + return int(size / config.MByte) +} + +// cacheSoakWorkflowSchedule returns a minute-granularity cron schedule aligned with the soak topology. +// Workflows with index divisible by cacheSoakSchedulePeriod use slowCronInterval (idle eviction tier); +// the rest use fastCronInterval to keep MaxLoaded full and drive cap eviction. +// Offsets stagger fires so cap and idle events land in different 5m Prometheus buckets. +func cacheSoakWorkflowSchedule(workflowIndex int) string { + if workflowIndex%cacheSoakSchedulePeriod == 0 { + offset := (workflowIndex / cacheSoakSchedulePeriod) % int(slowCronInterval.Minutes()) + return fmt.Sprintf("0 %d/%d * * * *", offset, int(slowCronInterval.Minutes())) + } + offset := workflowIndex % int(fastCronInterval.Minutes()) + return fmt.Sprintf("0 %d/%d * * * *", offset, int(fastCronInterval.Minutes())) +} + func saveJSONFile(path string, v any) error { if dir := filepath.Dir(path); dir != "" && dir != "." { if err := os.MkdirAll(dir, 0o755); err != nil { From 535629e9681fe32e155e2c7381a072748c11d85d Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 20 May 2026 06:29:21 +0200 Subject: [PATCH 08/13] clean up CI part --- .github/workflows/cre-wf-caching-test.yml | 55 +------------------ .../tests/soak/cre/workflow_caching_test.go | 4 -- 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/.github/workflows/cre-wf-caching-test.yml b/.github/workflows/cre-wf-caching-test.yml index c4b78294128..0aa1d35a921 100644 --- a/.github/workflows/cre-wf-caching-test.yml +++ b/.github/workflows/cre-wf-caching-test.yml @@ -21,8 +21,6 @@ on: chainlink_image_tag: required: true type: string - # TODO - remove after testing - pull_request: jobs: caching: @@ -32,7 +30,6 @@ jobs: deployment: false runs-on: runs-on=${{ github.run_id }}/cpu=32/ram=128/family=m6id+m6idn/spot=false/image=ubuntu24-full-x64/extras=s3-cache - # TODO Adjust to actual observation time timeout-minutes: 270 # 4h30m — test timeout is 4h20m permissions: contents: read @@ -107,7 +104,7 @@ jobs: # TODO - remove nightly image fallback after testing chainlink-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/${{ inputs.ecr_name || - 'chainlink' }}:${{ inputs.chainlink_image_tag != '' && inputs.chainlink_image_tag || 'nightly-20260519-plugins' }}" + 'chainlink' }}:${{ inputs.chainlink_image_tag }}" ctf-configs: configs/workflow-gateway-don-cache-soak-test.toml chip-router-image: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{secrets.QA_AWS_REGION @@ -136,7 +133,6 @@ jobs: }}.amazonaws.com/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced018\ 8f29b969" CRE_SOAK_DURATION: "4h" - # TODO - adjust test timeout to actual observation time run: | gotestsum \ --jsonfile=/tmp/gotest.log \ @@ -163,52 +159,3 @@ jobs: name: caching-docker-logs path: system-tests/tests/soak/cre/logs retention-days: 7 - - # notify-test-failure: - # name: Notify about test Failure - # #if: failure() - # if: false # TODO: Silence for now - # needs: [soak] - # environment: - # name: integration - # deployment: false - # runs-on: ubuntu-latest - # steps: - # - name: Send slack notification for failed resource regression tests - # id: send-slack-notification - # uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 - # with: - # errors: "true" - # method: chat.postMessage - # token: ${{ secrets.QA_SLACK_API_KEY }} - # payload: | - # { - # "channel": "C0117GGJB6Y", - # "text": "CRE Resource Regression Test Failed. Investigation is required.", - # "blocks": [ - # { - # "type": "section", - # "text": { - # "type": "mrkdwn", - # "text": "*:rotating_light: CRE Resource Regression Test Failed. Investigation is required. :rotating_light:*" - # } - # }, - # { - # "type": "section", - # "text": { - # "type": "mrkdwn", - # "text": "Alerting <@U033CGL3CDT>, CRE Resource Regression Test failed for commit <${{ github.server_url }}/${{ github.repository }}/commit/${{ github.sha }}|${{ github.sha }}> on run ID <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ github.run_id }}>. Please investigate." - # } - # }, - # { - # "type": "divider" - # }, - # { - # "type": "section", - # "text": { - # "type": "mrkdwn", - # "text": "Chainlink nodes used more resources than expected. Check the Docker logs and the alloc.pprof file." - # } - # } - # ] - # } diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index d191d02ab05..7a7ffa3458d 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -69,10 +69,6 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { } soakDuration := parseDuration(os.Getenv("CRE_SOAK_DURATION"), defaultSoakDuration) - // CRE_SOAK_CACHE_OBSERVE_WINDOW is a legacy alias for CRE_SOAK_DURATION. - if v := os.Getenv("CRE_SOAK_CACHE_OBSERVE_WINDOW"); v != "" { - soakDuration = parseDuration(v, soakDuration) - } testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetTestConfig(t, "/configs/workflow-gateway-don-cache-soak-test.toml")) testLogger := framework.L From 3eee1bfde5297e9e26b54b9e423260dc20ea2ed1 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 20 May 2026 07:05:35 +0200 Subject: [PATCH 09/13] move startTime to when all workflows are deployed + check if any container was restarted/OOMed during the test --- .../tests/soak/cre/workflow_caching_test.go | 6 +- .../tests/test-helpers/container_logs.go | 27 +++++++-- .../tests/test-helpers/container_restart.go | 60 +++++++++++++++++++ 3 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 system-tests/tests/test-helpers/container_restart.go diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index 7a7ffa3458d..238fd0959c9 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -87,7 +87,6 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { workflowFileLocation := "../../../../core/scripts/cre/environment/examples/workflows/cron/main.go" - startTime := time.Now() testLogger.Info(). Int("max_loaded_per_node", moduleCacheMaxLoaded). Int("target_workflows", numWorkflows). @@ -103,6 +102,8 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { } } testLogger.Info().Int("count", numWorkflows).Msg("All cache-test workflows deployed") + nodeContainers := t_helpers.SnapshotNodeContainerRestarts(t, testEnv) + startTime := time.Now() t_helpers.WatchWorkflowLogs(t, testLogger, userLogsCh, baseMessageCh, t_helpers.WorkflowEngineInitErrorLog, "Amazing workflow user log", 2*time.Minute) testLogger.Info().Dur("duration", soakDuration).Msg("First workflow execution confirmed, running cache soak...") @@ -229,6 +230,9 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { require.NoError(t, saveJSONFile(metric.filename, results), "failed to save JSON file for metric:", metric.filename) testLogger.Info().Str("filename", metric.filename).Msg("Saved JSON file for metric") } + + t_helpers.AssertNodeContainersStable(t, nodeContainers) + testLogger.Info().Msg("Node containers stable. None was restarted or OOM-killed.") } func crePerWorkflowSizeLimitMiB(size config.Size) int { diff --git a/system-tests/tests/test-helpers/container_logs.go b/system-tests/tests/test-helpers/container_logs.go index 1707b9f3f7c..e33ba766d66 100644 --- a/system-tests/tests/test-helpers/container_logs.go +++ b/system-tests/tests/test-helpers/container_logs.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "slices" "strings" "testing" @@ -16,22 +17,36 @@ import ( ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" ) -func AssertNodeLogs(t *testing.T, testEnv *ttypes.TestEnvironment, needle string) { +func clNodeContainerNames(t *testing.T, testEnv *ttypes.TestEnvironment) []string { t.Helper() - targetNames := make(map[string]struct{}) + names := make(map[string]struct{}) for _, nodeSet := range testEnv.Config.NodeSets { if nodeSet.Out == nil { continue } for _, clNode := range nodeSet.Out.CLNodes { - name := clNode.Node.ContainerName - if name != "" { - targetNames[name] = struct{}{} + if name := clNode.Node.ContainerName; name != "" { + names[name] = struct{}{} } } } - require.NotEmpty(t, targetNames, "no container names found in test environment") + require.NotEmpty(t, names, "no container names found in test environment") + out := make([]string, 0, len(names)) + for name := range names { + out = append(out, name) + } + slices.Sort(out) + return out +} + +func AssertNodeLogs(t *testing.T, testEnv *ttypes.TestEnvironment, needle string) { + t.Helper() + + targetNames := make(map[string]struct{}) + for _, name := range clNodeContainerNames(t, testEnv) { + targetNames[name] = struct{}{} + } logStreams, err := framework.StreamContainerLogs( client.ContainerListOptions{All: true}, diff --git a/system-tests/tests/test-helpers/container_restart.go b/system-tests/tests/test-helpers/container_restart.go new file mode 100644 index 00000000000..540a36d5e3b --- /dev/null +++ b/system-tests/tests/test-helpers/container_restart.go @@ -0,0 +1,60 @@ +package helpers + +import ( + "context" + "testing" + "time" + + dc "github.com/moby/moby/client" + "github.com/stretchr/testify/require" + + ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" +) + +// NodeContainerSnapshot records Docker RestartCount per Chainlink node container. +type NodeContainerSnapshot map[string]int + +// SnapshotNodeContainerRestarts captures restart counts after deploy/setup is complete. +func SnapshotNodeContainerRestarts(t *testing.T, testEnv *ttypes.TestEnvironment) NodeContainerSnapshot { + t.Helper() + + client := newDockerClient(t) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + names := clNodeContainerNames(t, testEnv) + snap := make(NodeContainerSnapshot, len(names)) + for _, name := range names { + res, err := client.ContainerInspect(ctx, name, dc.ContainerInspectOptions{}) + require.NoError(t, err, "inspect container %q", name) + snap[name] = res.Container.RestartCount + } + return snap +} + +// AssertNodeContainersStable fails if any node container restarted or was OOM-killed since the snapshot. +func AssertNodeContainersStable(t *testing.T, snap NodeContainerSnapshot) { + t.Helper() + + client := newDockerClient(t) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + for name, wantRestarts := range snap { + res, err := client.ContainerInspect(ctx, name, dc.ContainerInspectOptions{}) + require.NoError(t, err, "inspect container %q", name) + c := res.Container + require.NotNil(t, c.State, "container %q has no state", name) + require.Equal(t, wantRestarts, c.RestartCount, "container %q restart count changed", name) + require.False(t, c.State.OOMKilled, "container %q was OOM killed", name) + require.True(t, c.State.Running, "container %q is not running (status=%s)", name, c.State.Status) + } +} + +func newDockerClient(t *testing.T) *dc.Client { + t.Helper() + client, err := dc.New(dc.FromEnv) + require.NoError(t, err, "create docker client") + t.Cleanup(func() { _ = client.Close() }) + return client +} From 00e0b77a5a42129a1ac80e64d3d8e57d123e8f71 Mon Sep 17 00:00:00 2001 From: mchain0 Date: Wed, 20 May 2026 10:22:50 +0200 Subject: [PATCH 10/13] cre-4396: more metrics; adjusted MaxLoaded; more eviction pressure; --- .../workflow-gateway-don-cache-soak-test.toml | 2 +- .../tests/soak/cre/workflow_caching_test.go | 72 +++++++++++++++++-- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml index 1285e633be1..3b56ac64aca 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-cache-soak-test.toml @@ -52,7 +52,7 @@ [Capabilities.WorkflowRegistry.ModuleCache] Enabled = true # Sized for ~50% of 128GiB host at cap; per loaded slot ~= WASMBinarySizeLimit + WASMMemoryLimit (100MiB each by default). -MaxLoaded = 80 +MaxLoaded = 100 IdleEviction = true # Long enough that 3m fast cron stays hot and 8m slow cron idles out between runs. IdleTimeout = '5m' diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index 238fd0959c9..9226eeb53cb 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -29,8 +29,8 @@ import ( // moduleCacheMaxLoaded is per node; _defaultSoakNumWorkflows exceeds cap slots (~20%) // so enforceCap stays active after workflows are being triggered across nodes. const ( - capPressurePercent = 120 - moduleCacheMaxLoaded = 80 // mirrors workflow-gateway-don-cache-soak-test.toml MaxLoaded + capPressurePercent = 200 // 200% of MaxLoaded + moduleCacheMaxLoaded = 100 // mirrors workflow-gateway-don-cache-soak-test.toml MaxLoaded moduleCacheIdleTimeout = 5 * time.Minute fastCronInterval = 3 * time.Minute @@ -158,17 +158,81 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { step: defaultMetricStep, }, { + // Gauge: peak loaded modules per step (cap pressure vs MaxLoaded). metric: "platform_workflow_module_cache_loaded", - query: fmt.Sprintf("increase(platform_workflow_module_cache_loaded{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + query: fmt.Sprintf("max_over_time(platform_workflow_module_cache_loaded{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), filename: "metrics/cache_loaded.json", step: defaultMetricStep, }, { + // Gauge: average evicted-module bytes not held in RAM per step. metric: "platform_workflow_module_cache_memory_saved_bytes", - query: fmt.Sprintf("increase(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + query: fmt.Sprintf("avg_over_time(platform_workflow_module_cache_memory_saved_bytes{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), filename: "metrics/cache_memory_saved_bytes.json", step: defaultMetricStep, }, + { + // Gauge: workflows fetched from registry on last sync tick (registered on this node). + metric: "platform_workflow_registry_syncer_fetched_workflows", + query: fmt.Sprintf("max_over_time(platform_workflow_registry_syncer_fetched_workflows{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/registry_fetched_workflows.json", + step: defaultMetricStep, + }, + { + // Gauge: workflow engines currently running on this node. + metric: "platform_workflow_registry_syncer_running_workflows", + query: fmt.Sprintf("max_over_time(platform_workflow_registry_syncer_running_workflows{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/registry_running_workflows.json", + step: defaultMetricStep, + }, + { + metric: "platform_workflow_registry_syncer_completed_syncs_total", + query: fmt.Sprintf("increase(platform_workflow_registry_syncer_completed_syncs_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/registry_completed_syncs_increase.json", + step: defaultMetricStep, + }, + { + metric: "platform_workflow_registry_syncer_reconcile_events_backoff_total", + query: fmt.Sprintf("increase(platform_workflow_registry_syncer_reconcile_events_backoff_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/registry_reconcile_backoff_increase.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_workflow_execution_started_count", + query: fmt.Sprintf( + "sum(increase(platform_engine_workflow_execution_started_count{node_don=\"%%s\", node_index=\"%%d\"}[%s]))", + cachePrometheusRange, + ), + filename: "metrics/engine_execution_started_increase.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_workflow_execution_succeeded_count", + query: fmt.Sprintf( + "sum(increase(platform_engine_workflow_execution_succeeded_count{node_don=\"%%s\", node_index=\"%%d\"}[%s]))", + cachePrometheusRange, + ), + filename: "metrics/engine_execution_succeeded_increase.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_workflow_execution_failed_count", + query: fmt.Sprintf( + "sum(increase(platform_engine_workflow_execution_failed_count{node_don=\"%%s\", node_index=\"%%d\"}[%s]))", + cachePrometheusRange, + ), + filename: "metrics/engine_execution_failed_increase.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_trigger_event_received_total", + query: fmt.Sprintf( + "sum(increase(platform_engine_trigger_event_received_total{node_don=\"%%s\", node_index=\"%%d\"}[%s]))", + cachePrometheusRange, + ), + filename: "metrics/engine_trigger_event_received_increase.json", + step: defaultMetricStep, + }, { metric: "platform_workflow_module_cache_version_mismatch_total", query: fmt.Sprintf("increase(platform_workflow_module_cache_version_mismatch_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), From 56ccaf60fca9f19cdb736b59675600b7dbfcf2e1 Mon Sep 17 00:00:00 2001 From: mchain0 Date: Wed, 20 May 2026 11:00:03 +0200 Subject: [PATCH 11/13] cre-4396: more metrics; adjusted params --- .../tests/soak/cre/workflow_caching_test.go | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index 9226eeb53cb..6e14275813a 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -151,6 +151,20 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { filename: "metrics/cache_reload_increase.json", step: defaultMetricStep, }, + { + // Counter: WASM re-instantiated from on-disk cache (cold load path). + metric: "platform_workflow_module_cache_reload_total", + query: fmt.Sprintf("increase(platform_workflow_module_cache_reload_total{node_don=\"%%s\", node_index=\"%%d\", source=\"disk\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_reload_disk_increase.json", + step: defaultMetricStep, + }, + { + // Counter: WASM resurrected from in-memory weak ref (warm load path). + metric: "platform_workflow_module_cache_reload_total", + query: fmt.Sprintf("increase(platform_workflow_module_cache_reload_total{node_don=\"%%s\", node_index=\"%%d\", source=\"weak_ref\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_reload_memory_increase.json", + step: defaultMetricStep, + }, { metric: "platform_workflow_module_cache_eviction_total", query: fmt.Sprintf("increase(platform_workflow_module_cache_eviction_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), @@ -233,6 +247,39 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { filename: "metrics/engine_trigger_event_received_increase.json", step: defaultMetricStep, }, + // Engine schedule skew (histograms). No per-execution cache-path label exists; correlate skew + // with cache_reload_disk_increase vs cache_reload_memory_increase in the same time step. + // Hot LRU hits produce neither reload counter. + { + metric: "platform_engine_trigger_queue_to_execution_start_seconds", + query: histogramQuantileQuery("platform_engine_trigger_queue_to_execution_start_seconds", 0.50), + filename: "metrics/engine_trigger_skew_p50.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_trigger_queue_to_execution_start_seconds", + query: histogramQuantileQuery("platform_engine_trigger_queue_to_execution_start_seconds", 0.95), + filename: "metrics/engine_trigger_skew_p95.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_trigger_event_queue_wait_seconds", + query: histogramQuantileQuery("platform_engine_trigger_event_queue_wait_seconds", 0.95), + filename: "metrics/engine_trigger_queue_wait_p95.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_execution_semaphore_wait_seconds", + query: histogramQuantileQuery("platform_engine_execution_semaphore_wait_seconds", 0.95), + filename: "metrics/engine_execution_semaphore_wait_p95.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_workflow_completed_time_seconds", + query: histogramQuantileQuery("platform_engine_workflow_completed_time_seconds", 0.95), + filename: "metrics/engine_execution_duration_p95.json", + step: defaultMetricStep, + }, { metric: "platform_workflow_module_cache_version_mismatch_total", query: fmt.Sprintf("increase(platform_workflow_module_cache_version_mismatch_total{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), @@ -303,6 +350,14 @@ func crePerWorkflowSizeLimitMiB(size config.Size) int { return int(size / config.MByte) } +// histogramQuantileQuery aggregates per-workflow engine histograms on a node (sum by le). +func histogramQuantileQuery(metric string, quantile float64) string { + return fmt.Sprintf( + `histogram_quantile(%g, sum by (le) (rate(%s_bucket{node_don="%%s", node_index="%%d"}[%s])))`, + quantile, metric, cachePrometheusRange, + ) +} + // cacheSoakWorkflowSchedule returns a minute-granularity cron schedule aligned with the soak topology. // Workflows with index divisible by cacheSoakSchedulePeriod use slowCronInterval (idle eviction tier); // the rest use fastCronInterval to keep MaxLoaded full and drive cap eviction. From 91df8fd424091a6fd6df535c6d109ee09e7dd1e4 Mon Sep 17 00:00:00 2001 From: mchain0 Date: Wed, 20 May 2026 15:20:35 +0200 Subject: [PATCH 12/13] cre-4396: skew metrics improvement with an extension merged into develop in parallel --- .../tests/soak/cre/workflow_caching_test.go | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index 6e14275813a..01671e0e94f 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -247,9 +247,7 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { filename: "metrics/engine_trigger_event_received_increase.json", step: defaultMetricStep, }, - // Engine schedule skew (histograms). No per-execution cache-path label exists; correlate skew - // with cache_reload_disk_increase vs cache_reload_memory_increase in the same time step. - // Hot LRU hits produce neither reload counter. + // Engine schedule skew by module cache path (source: loaded | weak_ref | disk). { metric: "platform_engine_trigger_queue_to_execution_start_seconds", query: histogramQuantileQuery("platform_engine_trigger_queue_to_execution_start_seconds", 0.50), @@ -262,6 +260,24 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { filename: "metrics/engine_trigger_skew_p95.json", step: defaultMetricStep, }, + { + metric: "platform_engine_trigger_queue_to_execution_start_seconds", + query: histogramQuantileQueryBySource("platform_engine_trigger_queue_to_execution_start_seconds", 0.95, "loaded"), + filename: "metrics/engine_trigger_skew_loaded_p95.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_trigger_queue_to_execution_start_seconds", + query: histogramQuantileQueryBySource("platform_engine_trigger_queue_to_execution_start_seconds", 0.95, "weak_ref"), + filename: "metrics/engine_trigger_skew_weak_ref_p95.json", + step: defaultMetricStep, + }, + { + metric: "platform_engine_trigger_queue_to_execution_start_seconds", + query: histogramQuantileQueryBySource("platform_engine_trigger_queue_to_execution_start_seconds", 0.95, "disk"), + filename: "metrics/engine_trigger_skew_disk_p95.json", + step: defaultMetricStep, + }, { metric: "platform_engine_trigger_event_queue_wait_seconds", query: histogramQuantileQuery("platform_engine_trigger_event_queue_wait_seconds", 0.95), @@ -358,6 +374,13 @@ func histogramQuantileQuery(metric string, quantile float64) string { ) } +func histogramQuantileQueryBySource(metric string, quantile float64, source string) string { + return fmt.Sprintf( + `histogram_quantile(%g, sum by (le) (rate(%s_bucket{node_don="%%s", node_index="%%d", source="%s"}[%s])))`, + quantile, metric, source, cachePrometheusRange, + ) +} + // cacheSoakWorkflowSchedule returns a minute-granularity cron schedule aligned with the soak topology. // Workflows with index divisible by cacheSoakSchedulePeriod use slowCronInterval (idle eviction tier); // the rest use fastCronInterval to keep MaxLoaded full and drive cap eviction. From b37fadc3b5078302ed55d7160110a0e72914db47 Mon Sep 17 00:00:00 2001 From: mchain0 Date: Wed, 20 May 2026 15:24:04 +0200 Subject: [PATCH 13/13] cre-4396: added diskmonitor metrics --- .../tests/soak/cre/workflow_caching_test.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/system-tests/tests/soak/cre/workflow_caching_test.go b/system-tests/tests/soak/cre/workflow_caching_test.go index 01671e0e94f..85a7cea9ec7 100644 --- a/system-tests/tests/soak/cre/workflow_caching_test.go +++ b/system-tests/tests/soak/cre/workflow_caching_test.go @@ -185,6 +185,21 @@ func Test_V2_CRE_CacheSoak(t *testing.T) { filename: "metrics/cache_memory_saved_bytes.json", step: defaultMetricStep, }, + { + // Gauge: total on-disk bytes under the module cache dir (DiskMonitor, ~1m tick). + // max_over_time: peak footprint per step; correlates with deploy count and disk reloads. + metric: "platform_workflow_module_cache_disk_usage_bytes", + query: fmt.Sprintf("max_over_time(platform_workflow_module_cache_disk_usage_bytes{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_disk_usage_bytes.json", + step: defaultMetricStep, + }, + { + // Gauge: typical on-disk cache footprint during the step (smoothed). + metric: "platform_workflow_module_cache_disk_usage_bytes", + query: fmt.Sprintf("avg_over_time(platform_workflow_module_cache_disk_usage_bytes{node_don=\"%%s\", node_index=\"%%d\"}[%s])", cachePrometheusRange), + filename: "metrics/cache_disk_usage_avg_bytes.json", + step: defaultMetricStep, + }, { // Gauge: workflows fetched from registry on last sync tick (registered on this node). metric: "platform_workflow_registry_syncer_fetched_workflows",