Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions .github/actions/setup-build-environment/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ inputs:
cache-key:
description: 'Cache key identifier for Go cache'
required: true
save-cache:
description: 'Whether this job may save the Go cache (only effective on main). Set to false on jobs that share a cache-key across many matrix instances so only one designated job writes the key.'
required: false
default: 'true'

runs:
using: 'composite'
Expand Down Expand Up @@ -38,9 +42,12 @@ runs:
# On runs against main (push + the scheduled wipe-and-repopulate
# cron added in #2092): restore now, save at job end via the
# unified action's post-step (which fires at the calling job's
# end, even when invoked from a composite).
# end, even when invoked from a composite). Gated on save-cache so
# that when many matrix instances share one cache-key, only the
# designated job writes it (when several jobs save the same key
# concurrently, only the first succeeds, so the extra writers just waste time).
- name: Restore and save Go cache (main)
if: github.ref == 'refs/heads/main'
if: github.ref == 'refs/heads/main' && inputs.save-cache == 'true'
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: |
Expand All @@ -50,12 +57,12 @@ runs:
restore-keys: |
setup-go-${{ inputs.cache-key }}-${{ runner.os }}-go${{ steps.setup-go.outputs.go-version }}-

# On every other ref (PR / merge_group): restore only. Prefix
# fallback via restore-keys means runs whose go.sum differs from
# main still restore main's most recent cache and rebuild only
# the delta.
# On every other ref (PR / merge_group) or when this job is not the
# designated cache writer: restore only. Prefix fallback via
# restore-keys means runs whose go.sum differs from main still
# restore main's most recent cache and rebuild only the delta.
- name: Restore Go cache (non-main)
if: github.ref != 'refs/heads/main'
if: github.ref != 'refs/heads/main' || inputs.save-cache != 'true'
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: |
Expand Down
129 changes: 117 additions & 12 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
targets: ${{ steps.mask1.outputs.targets || steps.mask2.outputs.targets || steps.mask3.outputs.targets }}
acc_matrix: ${{ steps.accmatrix.outputs.matrix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand Down Expand Up @@ -70,14 +71,60 @@ jobs:
# Always run all tests
echo "targets=[\"test\"]" >> $GITHUB_OUTPUT

test:
# Build the acceptance-test shard matrix. Shard counts vary per
# (os, engine), which a static cross-product matrix can't express, so we
# emit an explicit include-list consumed via fromJSON in the test job.
- name: Build acceptance test shard matrix
id: accmatrix
env:
EVENT_NAME: ${{ github.event_name }}
run: |
python3 - <<'PY' >> "$GITHUB_OUTPUT"
import json
import os

# Name of the triggering GitHub event, handed in through the step's env.
trigger = os.environ["EVENT_NAME"]

# runs-on configuration for each OS name used in the matrix.
runner_by_os = {
    "linux": {"group": "databricks-protected-runner-group-large", "labels": "linux-ubuntu-latest-large"},
    "windows": {"group": "databricks-protected-runner-group-large", "labels": "windows-server-latest-large"},
    "macos": {"labels": "macos-latest"},
}

# Shard count for every (os, engine) pair. Windows is given more shards
# because TASK_CONCURRENCY=1 serializes tests inside each job, which leaves
# extra parallel jobs as the only way to shorten its wall time. The direct
# engine is faster than terraform, so it is split into fewer shards.
shards_per_target = {
    ("linux", "terraform"): 4,
    ("linux", "direct"): 2,
    ("macos", "terraform"): 4,
    ("macos", "direct"): 2,
    ("windows", "terraform"): 8,
    ("windows", "direct"): 4,
}

# In the merge queue only Linux runs, to reduce time to merge.
linux_only = trigger == "merge_group"

# One include-entry per shard, in the declaration order of shards_per_target.
entries = [
    {
        "os": {"name": os_name, "runner": runner_by_os[os_name]},
        "deployment": engine,
        "shard_index": shard,
        "shard_total": count,
    }
    for (os_name, engine), count in shards_per_target.items()
    if not (linux_only and os_name != "linux")
    for shard in range(count)
]

# Single "matrix=<json>" line; the surrounding heredoc appends it to $GITHUB_OUTPUT.
print("matrix=" + json.dumps({"include": entries}))
PY

test-unit:
needs:
- cleanups
- testmask

# Only run if the target is in the list of targets from testmask
if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
name: "task test (${{matrix.os.name}}, ${{matrix.deployment}})"
name: "task test-unit (${{matrix.os.name}})"
runs-on: ${{ matrix.os.runner }}

defaults:
Expand All @@ -94,8 +141,6 @@ jobs:
strategy:
fail-fast: false
matrix:
# Use separate fields for the OS name and runner configuration.
# When combined in a single object, "runs-on" errors with "Unexpected value 'name'".
os:
- name: linux
runner:
Expand All @@ -111,10 +156,6 @@ jobs:
runner:
labels: macos-latest

deployment:
- "terraform"
- "direct"

# Include "event_name" in the matrix so we can include/exclude based on it.
event:
- ${{ github.event_name }}
Expand All @@ -135,20 +176,83 @@ jobs:
- name: Setup build environment
uses: ./.github/actions/setup-build-environment
with:
cache-key: test-${{ matrix.deployment }}
# Sole writer of the shared "test" cache (test-acc shards restore it).
cache-key: test

- name: Run tests
run: go tool -modfile=tools/task/go.mod task test-unit

- name: Upload gotestsum JSON output
# Always upload so we can inspect timing even if tests fail.
# This is debug-only telemetry; a flaky artifact upload must not fail
# an otherwise-passing job.
if: ${{ always() }}
continue-on-error: true
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: test-output-unit-${{ matrix.os.name }}
path: test-output-unit.json
if-no-files-found: warn
retention-days: 7

test:
needs:
- cleanups
- testmask

# Only run if the target is in the list of targets from testmask
if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
name: "task test-acc (${{matrix.os.name}}, ${{matrix.deployment}}, shard ${{matrix.shard_index}}/${{matrix.shard_total}})"
runs-on: ${{ matrix.os.runner }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: read

env:
TASK_CONCURRENCY: ${{ matrix.os.name == 'windows' && '1' || '' }}

strategy:
fail-fast: false
# Generated by testmask: an include-list with per-(os, engine) shard
# counts. Each entry carries os{name,runner}, deployment, shard_index,
# and shard_total.
matrix: ${{ fromJSON(needs.testmask.outputs.acc_matrix) }}

steps:
- name: Checkout repository and submodules
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Setup build environment
uses: ./.github/actions/setup-build-environment
with:
# Shares the cache-key with test-unit so these shards restore the
# cache it saves. save-cache is false because many shard/deployment
# instances share this key; test-unit is the sole writer.
cache-key: test
save-cache: false

- name: Run tests
env:
ENVFILTER: DATABRICKS_BUNDLE_ENGINE=${{ matrix.deployment }}
run: go tool -modfile=tools/task/go.mod task test
SHARD_INDEX: ${{ matrix.shard_index }}
SHARD_TOTAL: ${{ matrix.shard_total }}
run: go tool -modfile=tools/task/go.mod task test-acc

- name: Upload gotestsum JSON output
# Always upload so we can inspect timing even if tests fail.
# This is debug-only telemetry; a flaky artifact upload must not fail
# an otherwise-passing job.
if: ${{ always() }}
continue-on-error: true
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}
path: test-output.json
name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}-shard${{ matrix.shard_index }}
path: test-output-acc.json
if-no-files-found: warn
retention-days: 7

Expand Down Expand Up @@ -329,6 +433,7 @@ jobs:
# Reference: https://github.com/orgs/community/discussions/25970
test-result:
needs:
- test-unit
- test
- test-exp-aitools
- test-exp-ssh
Expand Down
22 changes: 22 additions & 0 deletions acceptance/acceptance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ func testAccept(t *testing.T, inprocessMode bool, singleTest string) int {
return n != singleTest
})
require.NotEmpty(t, testDirs, "singleTest=%#v did not match any tests\n%#v", singleTest, testDirs)
} else {
// Sharding applies only to the full run. A specific singleTest (e.g.
// TestInprocessMode) must never be filtered out by the shard split.
testDirs = shardTests(testDirs)
}

skippedDirs := 0
Expand Down Expand Up @@ -513,6 +517,24 @@ func getTests(t *testing.T) []string {
return testDirs
}

// shardTests returns the subset of testDirs assigned to this CI shard when
// SHARD_TOTAL > 1, or testDirs unchanged otherwise. testDirs must be sorted so
// the split is deterministic and stable across runs.
func shardTests(testDirs []string) []string {
total, _ := strconv.Atoi(os.Getenv("SHARD_TOTAL"))
if total <= 1 {
return testDirs
}
index, _ := strconv.Atoi(os.Getenv("SHARD_INDEX"))
sharded := testDirs[:0]
for i, d := range testDirs {
if i%total == index {
sharded = append(sharded, d)
}
}
return sharded
}

func validateTestPhase(phase int) error {
if phase == 0 || phase == 1 {
return nil
Expand Down
Loading