From 433d0d788a8aab1a6d52b4e0766937c148716cb0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 2 Jun 2026 17:36:08 +0000 Subject: [PATCH 1/2] [SPARK-57221][INFRA] Make merge_spark_pr.py fail fast on missing JIRA setup and stale script versions ### What changes were proposed in this pull request? Harden `dev/merge_spark_pr.py` so it refuses to run when a prerequisite for a correct merge is missing, instead of silently proceeding: - `initialize_jira()` now fails fast when the `jira` library is not installed or when no JIRA credentials are configured, rather than prompting "Continue without jira?" and merging without resolving the ticket. This runs before any git work, so nothing is merged when it aborts. - Add `check_script_up_to_date()`, run first in `main()`, which asks GitHub for the latest commit on master touching this script and fails if that commit is not in the local history (i.e. the committer is missing an upstream update). It compares commit reachability rather than downloading the file, so being ahead of master is fine. Set `SKIP_VERSION_CHECK=1` to bypass when iterating on the script itself. - Drop the now-unreachable `asf_jira is None` guards and the misleading "Exiting without trying to close the associated JIRA" branch. ### Why are the changes needed? When the `jira` library is not installed (or credentials are unset), the merge script would prompt to continue without JIRA and then merge the PR while leaving the associated ticket Open. This is easy to miss and tedious to reconcile later. A stale local copy of the merge script can likewise reintroduce already-fixed behavior. Failing loudly on both makes a correct merge the only path. ### Does this PR introduce _any_ user-facing change? No. This only affects committers running the merge tooling. ### How was this patch tested? - `python3 -m py_compile dev/merge_spark_pr.py` and `python3 -m doctest dev/merge_spark_pr.py`. - Exercised `check_script_up_to_date()` for the up-to-date/ahead (pass), `SKIP_VERSION_CHECK` (bypass), and missing-upstream-commit (loud fail) paths. ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Code (Opus 4.8) Co-authored-by: Isaac --- dev/merge_spark_pr.py | 136 ++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 39 deletions(-) diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index c82b930e56ec9..a3da0102e0879 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -86,6 +86,11 @@ JIRA_API_BASE = "https://issues.apache.org/jira" # Prefix added to temporary branches BRANCH_PREFIX = "PR_TOOL" +# Set to a truthy value to skip the check that compares this script against apache/spark master. +# Intended for committers iterating on the merge script itself, where a local diff is expected. +SKIP_VERSION_CHECK = os.environ.get("SKIP_VERSION_CHECK", "") +# Path of this script relative to the repo root, used to fetch the canonical copy from master. +MERGE_SCRIPT_REPO_PATH = "dev/merge_spark_pr.py" def semver_branch_rank(name): @@ -1063,13 +1068,15 @@ def get_current_ref(): def initialize_jira(): + # JIRA access is mandatory: a successful merge must resolve the associated ticket. Bail out + # early -- before any git work -- if a prerequisite is missing, rather than silently merging + # and leaving the JIRA open (which is easy to miss and tedious to reconcile after the fact). global asf_jira asf_jira = None jira_server = {"server": JIRA_API_BASE} if not JIRA_IMPORTED: - print_error("ERROR finding jira library. Run 'pip3 install jira' to install.") - continue_maybe("Continue without jira?") + fail("ERROR finding jira library. Run 'pip3 install jira' to install, then retry.") elif JIRA_ACCESS_TOKEN: client = jira.client.JIRA( jira_server, token_auth=JIRA_ACCESS_TOKEN, timeout=(JIRA_CONNECT_TIMEOUT, 30) @@ -1098,11 +1105,64 @@ def initialize_jira(): timeout=(JIRA_CONNECT_TIMEOUT, 30), ) else: - print("Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set.") - continue_maybe("Continue without jira?") + fail( + "Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set. Configure JIRA " + "credentials (a personal access token is recommended) and retry. See " + "https://issues.apache.org/jira/secure/ViewProfile.jspa -> Personal Access Tokens." + ) + + +def check_script_up_to_date(): + # Running a stale merge script can silently reintroduce already-fixed behavior (e.g. skipping + # JIRA resolution). Rather than download the whole file, ask GitHub only for the latest commit + # on master that touched this script (a small response) and fail if that commit is not already + # in the local history -- i.e. the committer is missing an upstream update. Being ahead of + # master (branched from it) is fine and does not trip the check. Best-effort: if master cannot + # be reached we warn and proceed rather than block merges on a transient network/rate error. + if SKIP_VERSION_CHECK: + print("SKIP_VERSION_CHECK is set; not verifying this script against master.") + return + + url = "%s/commits?path=%s&sha=master&per_page=1" % (GITHUB_API_BASE, MERGE_SCRIPT_REPO_PATH) + try: + request = Request(url) + if GITHUB_OAUTH_KEY: + request.add_header("Authorization", "token %s" % GITHUB_OAUTH_KEY) + commits = json.load(urlopen(request)) + except Exception as e: + print_error( + "Could not check %s against master (%s). Proceeding with the local copy." + % (MERGE_SCRIPT_REPO_PATH, e) + ) + return + if not commits: + return + latest_sha = commits[0]["sha"] + + # `git merge-base --is-ancestor` exits 0 when latest_sha is reachable from HEAD (we have it), + # nonzero when it is missing (stale) or unknown locally (never fetched). + try: + subprocess.check_call( + ["git", "merge-base", "--is-ancestor", latest_sha, "HEAD"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return + except subprocess.CalledProcessError: + pass + + fail( + "Your local %s is out of date: apache/spark master has a newer revision (%s) that is not " + "in your local history. Update it before merging, e.g.:\n" + " git fetch %s master && git checkout %s/master -- %s\n" + "then re-run. Set SKIP_VERSION_CHECK=1 to bypass (e.g. when changing this script itself)." + % (MERGE_SCRIPT_REPO_PATH, latest_sha[:12], PUSH_REMOTE_NAME, PUSH_REMOTE_NAME, + MERGE_SCRIPT_REPO_PATH) + ) def main(): + check_script_up_to_date() initialize_jira() global original_head @@ -1266,31 +1326,31 @@ def main(): ) continue_maybe(msg) - if asf_jira is not None: - jira_ids = re.findall("SPARK-[0-9]{4,5}", title) - # Epic / Umbrella tickets group related work and must not be resolved by a single PR. - # Collect every offender so the committer sees the full list in one shot rather than - # discovering them one-by-one across repeated merge attempts. - blocking_issue_types = {"Epic", "Umbrella"} - blockers = [] - for jira_id in jira_ids: - try: - issue = asf_jira.issue(jira_id) - except Exception: - print_error("Unable to fetch summary of %s" % jira_id) - continue - print_jira_issue_summary(issue) - issue_type = issue.fields.issuetype.name - if issue_type in blocking_issue_types: - blockers.append((jira_id, issue_type)) - if blockers: - ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers) - fail( - "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic " - "tickets and MUST not be resolved by a single PR. File " - "Sub-task(s) under %s and update the PR title to reference " - "the Sub-task(s) instead." % (pr_num, ids_str, ids_str) - ) + # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise. + jira_ids = re.findall("SPARK-[0-9]{4,5}", title) + # Epic / Umbrella tickets group related work and must not be resolved by a single PR. + # Collect every offender so the committer sees the full list in one shot rather than + # discovering them one-by-one across repeated merge attempts. + blocking_issue_types = {"Epic", "Umbrella"} + blockers = [] + for jira_id in jira_ids: + try: + issue = asf_jira.issue(jira_id) + except Exception: + print_error("Unable to fetch summary of %s" % jira_id) + continue + print_jira_issue_summary(issue) + issue_type = issue.fields.issuetype.name + if issue_type in blocking_issue_types: + blockers.append((jira_id, issue_type)) + if blockers: + ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers) + fail( + "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic " + "tickets and MUST not be resolved by a single PR. File " + "Sub-task(s) under %s and update the PR title to reference " + "the Sub-task(s) instead." % (pr_num, ids_str, ids_str) + ) print("\n=== Pull Request #%s ===" % pr_num) print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url)) @@ -1330,16 +1390,14 @@ def main(): if b in remaining_branches: remaining_branches.remove(b) - if asf_jira is not None: - continue_maybe("Would you like to update an associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % ( - pr_num, - GITHUB_BASE, - pr_num, - ) - resolve_jira_issues(title, merged_refs, jira_comment) - else: - print("Exiting without trying to close the associated JIRA.") + # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise. + continue_maybe("Would you like to update an associated JIRA?") + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % ( + pr_num, + GITHUB_BASE, + pr_num, + ) + resolve_jira_issues(title, merged_refs, jira_comment) if __name__ == "__main__": From 118e02b509ce1e873ac5d19534a19f5736c5c4c3 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 3 Jun 2026 05:56:44 +0000 Subject: [PATCH 2/2] Fix black formatting Co-authored-by: Isaac --- dev/merge_spark_pr.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index a3da0102e0879..9fdffb7ab13ee 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -1156,8 +1156,13 @@ def check_script_up_to_date(): "in your local history. Update it before merging, e.g.:\n" " git fetch %s master && git checkout %s/master -- %s\n" "then re-run. Set SKIP_VERSION_CHECK=1 to bypass (e.g. when changing this script itself)." - % (MERGE_SCRIPT_REPO_PATH, latest_sha[:12], PUSH_REMOTE_NAME, PUSH_REMOTE_NAME, - MERGE_SCRIPT_REPO_PATH) + % ( + MERGE_SCRIPT_REPO_PATH, + latest_sha[:12], + PUSH_REMOTE_NAME, + PUSH_REMOTE_NAME, + MERGE_SCRIPT_REPO_PATH, + ) )