diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index c82b930e56ec..9fdffb7ab13e 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -86,6 +86,11 @@ JIRA_API_BASE = "https://issues.apache.org/jira" # Prefix added to temporary branches BRANCH_PREFIX = "PR_TOOL" +# Set to a truthy value to skip the check that compares this script against apache/spark master. +# Intended for committers iterating on the merge script itself, where a local diff is expected. +SKIP_VERSION_CHECK = os.environ.get("SKIP_VERSION_CHECK", "") +# Path of this script relative to the repo root, used to fetch the canonical copy from master. +MERGE_SCRIPT_REPO_PATH = "dev/merge_spark_pr.py" def semver_branch_rank(name): @@ -1063,13 +1068,15 @@ def get_current_ref(): def initialize_jira(): + # JIRA access is mandatory: a successful merge must resolve the associated ticket. Bail out + # early -- before any git work -- if a prerequisite is missing, rather than silently merging + # and leaving the JIRA open (which is easy to miss and tedious to reconcile after the fact). global asf_jira asf_jira = None jira_server = {"server": JIRA_API_BASE} if not JIRA_IMPORTED: - print_error("ERROR finding jira library. Run 'pip3 install jira' to install.") - continue_maybe("Continue without jira?") + fail("ERROR finding jira library. Run 'pip3 install jira' to install, then retry.") elif JIRA_ACCESS_TOKEN: client = jira.client.JIRA( jira_server, token_auth=JIRA_ACCESS_TOKEN, timeout=(JIRA_CONNECT_TIMEOUT, 30) @@ -1098,11 +1105,69 @@ def initialize_jira(): timeout=(JIRA_CONNECT_TIMEOUT, 30), ) else: - print("Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set.") - continue_maybe("Continue without jira?") + fail( + "Neither JIRA_ACCESS_TOKEN nor JIRA_USERNAME/JIRA_PASSWORD are set. Configure JIRA " + "credentials (a personal access token is recommended) and retry. See " + "https://issues.apache.org/jira/secure/ViewProfile.jspa -> Personal Access Tokens." + ) + + +def check_script_up_to_date(): + # Running a stale merge script can silently reintroduce already-fixed behavior (e.g. skipping + # JIRA resolution). Rather than download the whole file, ask GitHub only for the latest commit + # on master that touched this script (a small response) and fail if that commit is not already + # in the local history -- i.e. the committer is missing an upstream update. Being ahead of + # master (branched from it) is fine and does not trip the check. Best-effort: if master cannot + # be reached we warn and proceed rather than block merges on a transient network/rate error. + if SKIP_VERSION_CHECK: + print("SKIP_VERSION_CHECK is set; not verifying this script against master.") + return + + url = "%s/commits?path=%s&sha=master&per_page=1" % (GITHUB_API_BASE, MERGE_SCRIPT_REPO_PATH) + try: + request = Request(url) + if GITHUB_OAUTH_KEY: + request.add_header("Authorization", "token %s" % GITHUB_OAUTH_KEY) + commits = json.load(urlopen(request)) + except Exception as e: + print_error( + "Could not check %s against master (%s). Proceeding with the local copy." + % (MERGE_SCRIPT_REPO_PATH, e) + ) + return + if not commits: + return + latest_sha = commits[0]["sha"] + + # `git merge-base --is-ancestor` exits 0 when latest_sha is reachable from HEAD (we have it), + # nonzero when it is missing (stale) or unknown locally (never fetched). + try: + subprocess.check_call( + ["git", "merge-base", "--is-ancestor", latest_sha, "HEAD"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return + except subprocess.CalledProcessError: + pass + + fail( + "Your local %s is out of date: apache/spark master has a newer revision (%s) that is not " + "in your local history. Update it before merging, e.g.:\n" + " git fetch %s master && git checkout %s/master -- %s\n" + "then re-run. Set SKIP_VERSION_CHECK=1 to bypass (e.g. when changing this script itself)." + % ( + MERGE_SCRIPT_REPO_PATH, + latest_sha[:12], + PUSH_REMOTE_NAME, + PUSH_REMOTE_NAME, + MERGE_SCRIPT_REPO_PATH, + ) + ) def main(): + check_script_up_to_date() initialize_jira() global original_head @@ -1266,31 +1331,31 @@ def main(): ) continue_maybe(msg) - if asf_jira is not None: - jira_ids = re.findall("SPARK-[0-9]{4,5}", title) - # Epic / Umbrella tickets group related work and must not be resolved by a single PR. - # Collect every offender so the committer sees the full list in one shot rather than - # discovering them one-by-one across repeated merge attempts. - blocking_issue_types = {"Epic", "Umbrella"} - blockers = [] - for jira_id in jira_ids: - try: - issue = asf_jira.issue(jira_id) - except Exception: - print_error("Unable to fetch summary of %s" % jira_id) - continue - print_jira_issue_summary(issue) - issue_type = issue.fields.issuetype.name - if issue_type in blocking_issue_types: - blockers.append((jira_id, issue_type)) - if blockers: - ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers) - fail( - "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic " - "tickets and MUST not be resolved by a single PR. File " - "Sub-task(s) under %s and update the PR title to reference " - "the Sub-task(s) instead." % (pr_num, ids_str, ids_str) - ) + # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise. + jira_ids = re.findall("SPARK-[0-9]{4,5}", title) + # Epic / Umbrella tickets group related work and must not be resolved by a single PR. + # Collect every offender so the committer sees the full list in one shot rather than + # discovering them one-by-one across repeated merge attempts. + blocking_issue_types = {"Epic", "Umbrella"} + blockers = [] + for jira_id in jira_ids: + try: + issue = asf_jira.issue(jira_id) + except Exception: + print_error("Unable to fetch summary of %s" % jira_id) + continue + print_jira_issue_summary(issue) + issue_type = issue.fields.issuetype.name + if issue_type in blocking_issue_types: + blockers.append((jira_id, issue_type)) + if blockers: + ids_str = ", ".join("%s (%s)" % (jid, t) for jid, t in blockers) + fail( + "Cannot merge PR #%s. Linked JIRA(s) %s are Umbrella or Epic " + "tickets and MUST not be resolved by a single PR. File " + "Sub-task(s) under %s and update the PR title to reference " + "the Sub-task(s) instead." % (pr_num, ids_str, ids_str) + ) print("\n=== Pull Request #%s ===" % pr_num) print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url)) @@ -1330,16 +1395,14 @@ def main(): if b in remaining_branches: remaining_branches.remove(b) - if asf_jira is not None: - continue_maybe("Would you like to update an associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % ( - pr_num, - GITHUB_BASE, - pr_num, - ) - resolve_jira_issues(title, merged_refs, jira_comment) - else: - print("Exiting without trying to close the associated JIRA.") + # asf_jira is guaranteed to be set here: initialize_jira() fails fast otherwise. + continue_maybe("Would you like to update an associated JIRA?") + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % ( + pr_num, + GITHUB_BASE, + pr_num, + ) + resolve_jira_issues(title, merged_refs, jira_comment) if __name__ == "__main__":