apache · cloud-fan · Jun 2, 2026
diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
@@ -86,6 +86,11 @@
 JIRA_API_BASE = "https://issues.apache.org/jira"
 # Prefix added to temporary branches
 BRANCH_PREFIX = "PR_TOOL"
+# Set to a truthy value to skip the check that compares this script against apache/spark master.
+# Intended for committers iterating on the merge script itself, where a local diff is expected.
+SKIP_VERSION_CHECK = os.environ.get("SKIP_VERSION_CHECK", "")
+# Path of this script relative to the repo root, used to fetch the canonical copy from master.
+MERGE_SCRIPT_REPO_PATH = "dev/merge_spark_pr.py"
 
 
 def semver_branch_rank(name):
@@ -1063,13 +1068,15 @@ def get_current_ref():
 
 
 def initialize_jira():
+    # JIRA access is mandatory: a successful merge must resolve the associated ticket. Bail out
+    # early -- before any git work -- if a prerequisite is missing, rather than silently merging
+    # and leaving the JIRA open (which is easy to miss and tedious to reconcile after the fact).
     global asf_jira
     asf_jira = None
     jira_server = {"server": JIRA_API_BASE}
 
     if not JIRA_IMPORTED:
-        print_error("ERROR finding jira library. Run 'pip3 install jira' to install.")
-        continue_maybe("Continue without jira?")
+        fail("ERROR finding jira library. Run 'pip3 install jira' to install, then retry.")
     elif JIRA_ACCESS_TOKEN:
         client = jira.client.JIRA(
             jira_server, token_auth=JIRA_ACCESS_TOKEN, timeout=(JIRA_CONNECT_TIMEOUT, 30)
@@ -1098,11 +1105,64 @@ def initialize_jira():
             timeout=(JIRA_CONNECT_TIMEOUT, 30),
         )
     else:
-        print("Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set.")
-        continue_maybe("Continue without jira?")
+        fail(
+            "Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set. Configure JIRA "
+            "credentials (a personal access token is recommended) and retry. See "
+            "https://issues.apache.org/jira/secure/ViewProfile.jspa -> Personal Access Tokens."
+        )
+
+
+def check_script_up_to_date():
+    # Running a stale merge script can silently reintroduce already-fixed behavior (e.g. skipping
+    # JIRA resolution). Rather than download the whole file, ask GitHub only for the latest commit
+    # on master that touched this script (a small response) and fail if that commit is not already
+    # in the local history -- i.e. the committer is missing an upstream update. Being ahead of
+    # master (branched from it) is fine and does not trip the check. Best-effort: if master cannot
+    # be reached we warn and proceed rather than block merges on a transient network/rate error.
+    if SKIP_VERSION_CHECK:
+        print("SKIP_VERSION_CHECK is set; not verifying this script against master.")
+        return
+
+    url = "%s/commits?path=%s&sha=master&per_page=1" % (GITHUB_API_BASE, MERGE_SCRIPT_REPO_PATH)
+    try:
+        request = Request(url)
+        if GITHUB_OAUTH_KEY:
+            request.add_header("Authorization", "token %s" % GITHUB_OAUTH_KEY)
+        commits = json.load(urlopen(request))
+    except Exception as e:
+        print_error(
+            "Could not check %s against master (%s). Proceeding with the local copy."
+            % (MERGE_SCRIPT_REPO_PATH, e)
+        )
+        return
+    if not commits:
+        return
+    latest_sha = commits[0]["sha"]
+
+    # `git merge-base --is-ancestor` exits 0 when latest_sha is reachable from HEAD (we have it),
+    # nonzero when it is missing (stale) or unknown locally (never fetched).
+    try:
+        subprocess.check_call(
+            ["git", "merge-base", "--is-ancestor", latest_sha, "HEAD"],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        return
+    except subprocess.CalledProcessError:
+        pass
+
+    fail(
+        "Your local %s is out of date: apache/spark master has a newer revision (%s) that is not "
+        "in your local history. Update it before merging, e.g.:\n"
+        "    git fetch %s master && git checkout %s/master -- %s\n"
+        "then re-run. Set SKIP_VERSION_CHECK=1 to bypass (e.g. when changing this script itself)."
+        % (MERGE_SCRIPT_REPO_PATH, latest_sha[:12], PUSH_REMOTE_NAME, PUSH_REMOTE_NAME,
+           MERGE_SCRIPT_REPO_PATH)
+    )
 
 
 def main():
+    check_script_up_to_date()
     initialize_jira()
     global original_head
 
@@ -1266,31 +1326,31 @@ def main():
         )
         continue_maybe(msg)
 
-    if asf_jira is not None:
-        jira_ids = re.findall("SPARK-[0-9]{4,5}", title)
-        # Epic / Umbrella tickets group related work and must not be resolved by a single PR.
-        # Collect every offender so the committer sees the full list in one shot rather than
-        # discovering them one-by-one across repeated merge attempts.
-        blocking_issue_types = {"Epic", "Umbrella"}
-        blockers = []
-        for jira_id in jira_ids:
-            try:
-                issue = asf_jira.issue(jira_id)
-            except Exception:
-                print_error("Unable to fetch summary of %s" % jira_id)
-                continue
-            print_jira_issue_summary(issue)
-            issue_type = issue.fields.issuetype.name
-            if issue_type in blocking_issue_types:
-                blockers.append((jira_id, issue_type))
-        if blockers:
-            ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers)
-            fail(
-                "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic "
-                "tickets and MUST not be resolved by a single PR. File "
-                "Sub-task(s) under %s and update the PR title to reference "
-                "the Sub-task(s) instead." % (pr_num, ids_str, ids_str)
-            )
+    # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise.
+    jira_ids = re.findall("SPARK-[0-9]{4,5}", title)
+    # Epic / Umbrella tickets group related work and must not be resolved by a single PR.
+    # Collect every offender so the committer sees the full list in one shot rather than
+    # discovering them one-by-one across repeated merge attempts.
+    blocking_issue_types = {"Epic", "Umbrella"}
+    blockers = []
+    for jira_id in jira_ids:
+        try:
+            issue = asf_jira.issue(jira_id)
+        except Exception:
+            print_error("Unable to fetch summary of %s" % jira_id)
+            continue
+        print_jira_issue_summary(issue)
+        issue_type = issue.fields.issuetype.name
+        if issue_type in blocking_issue_types:
+            blockers.append((jira_id, issue_type))
+    if blockers:
+        ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers)
+        fail(
+            "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic "
+            "tickets and MUST not be resolved by a single PR. File "
+            "Sub-task(s) under %s and update the PR title to reference "
+            "the Sub-task(s) instead." % (pr_num, ids_str, ids_str)
+        )
 
     print("\n=== Pull Request #%s ===" % pr_num)
     print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url))
@@ -1330,16 +1390,14 @@ def main():
             if b in remaining_branches:
                 remaining_branches.remove(b)
 
-    if asf_jira is not None:
-        continue_maybe("Would you like to update an associated JIRA?")
-        jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (
-            pr_num,
-            GITHUB_BASE,
-            pr_num,
-        )
-        resolve_jira_issues(title, merged_refs, jira_comment)
-    else:
-        print("Exiting without trying to close the associated JIRA.")
+    # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise.
+    continue_maybe("Would you like to update an associated JIRA?")
+    jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (
+        pr_num,
+        GITHUB_BASE,
+        pr_num,
+    )
+    resolve_jira_issues(title, merged_refs, jira_comment)
 
 
 if __name__ == "__main__":