From 13c29864ae935142afc9c823373a8dffe5f12dce Mon Sep 17 00:00:00 2001 From: Luna Date: Tue, 21 Apr 2026 00:06:32 +0200 Subject: [PATCH] ci: batch wiki queries --- .github/scripts/updateWikiLinks.py | 166 ++++++++++++++++++++++------- 1 file changed, 126 insertions(+), 40 deletions(-) diff --git a/.github/scripts/updateWikiLinks.py b/.github/scripts/updateWikiLinks.py index 2aa53319a3..43b6a752bd 100644 --- a/.github/scripts/updateWikiLinks.py +++ b/.github/scripts/updateWikiLinks.py @@ -47,6 +47,86 @@ def _escape_wiki_links(links: list[str]) -> list[str]: links[i] = link.replace("=", "\\u003d").replace("'", "\\u0027") return links +def _page_title_from_url(pageUrl: str) -> tuple[str, str]: + parsed = urlparse(pageUrl) + apiUrl = f"https://{parsed.netloc}/api.php" + pageTitle = unquote(parsed.path.removeprefix("/").removeprefix("w/").removeprefix("wiki/")) + return apiUrl, pageTitle + +def _batch_page_existence_requests(pageUrls: list[str]): + processed = 0 + for i in range(0, len(pageUrls), 50): + batch = pageUrls[i:i + 50] + apiUrl, _ = _page_title_from_url(batch[0]) + hostname = urlparse(apiUrl).netloc + titlesByUrl = {} + titles = [] + print(f"Performing batch page lookup for {hostname} ({processed:,}/{len(pageUrls):,})") + for pageUrl in batch: + batchApiUrl, pageTitle = _page_title_from_url(pageUrl) + if batchApiUrl != apiUrl: + raise ValueError("Batch contains mixed wiki hosts") + titlesByUrl[pageUrl] = pageTitle + titles.append(pageTitle) + processed += len(batch) + + response = httpPool.request( + "GET", + apiUrl, + fields={ + "action": "query", + "format": "json", + "titles": "|".join(titles), + "redirects": 1, + "formatversion": 2, + }, + ) + + if response.status != 200: + print(f"Failed to fetch batch from {apiUrl} ({response.status})") + for pageUrl in batch: + attemptedLinks[pageUrl] = False + continue + + payload = json.loads(response.data.decode("utf-8")) + titleStatuses = {} + for page in payload.get("query", {}).get("pages", []): + titleStatuses[page["title"]] = "missing" not in page + + for redirect in payload.get("query", {}).get("redirects", []): + titleStatuses[redirect["from"]] = titleStatuses.get(redirect["to"], False) + + for normalized in payload.get("query", {}).get("normalized", []): + titleStatuses[normalized["from"]] = titleStatuses.get(normalized["to"], False) + + for pageUrl, pageTitle in titlesByUrl.items(): + attemptedLinks[pageUrl] = titleStatuses.get(pageTitle, False) + +def _prime_page_existence_cache(pageUrls: list[str]): + uniqueUrls = list(dict.fromkeys(pageUrls)) + urlsByApiUrl = {} + for pageUrl in uniqueUrls: + apiUrl, _ = _page_title_from_url(pageUrl) + urlsByApiUrl.setdefault(apiUrl, []).append(pageUrl) + + for _, urls in urlsByApiUrl.items(): + _batch_page_existence_requests(urls) + +def _candidate_page_urls(formattedName: str) -> list[str]: + urls = [ + unofficialLink + modifyUnofficialItem(formattedName), + officialLink + modifyOfficialItem(formattedName), + ] + + if "_Of_" in formattedName or "_The_" in formattedName or "_To_" in formattedName: + formattedName_lower_prepositions = _replace_title_case_prepositions(formattedName) + urls.extend([ + unofficialLink + modifyUnofficialItem(formattedName_lower_prepositions), + officialLink + modifyOfficialItem(formattedName_lower_prepositions), + ]) + + return urls + def _update_special_case_links(filename: str, jsonData: dict, file, desired_links: list[str]) -> bool: global modifiedCount desired_links = _escape_wiki_links(desired_links) @@ -66,6 +146,30 @@ def _update_special_case_links(filename: str, jsonData: dict, file, desired_link json.dump(jsonData, file, indent=2, ensure_ascii=False) return file_modified +def _has_complete_links(existingInfo: dict) -> bool: + validLinks = [link for link in existingInfo if link.startswith(unofficialLink) or link.startswith(officialLink)] + return validLinks and existingInfo == validLinks + +def _should_skip_for_lookup(filename: str, jsonData: dict) -> bool: + if ( + ("vanilla" in jsonData + or jsonData["itemid"] == "minecraft:enchanted_book" + or jsonData["itemid"] == "minecraft:potion") + ): + return True + + if filename.startswith('⚚_') or filename.startswith('ATTRIBUTE_'): + return True + + if filename.startswith('BALLOON_HAT_2024') or filename.startswith('BALLOON_HAT_2025'): + return True + + existingInfo = jsonData.get("info", []) + if _has_complete_links(existingInfo): + return True + + return False + def processItemFile(filename: str): global modifiedCount, badModifiedCount @@ -102,30 +206,28 @@ def processItemFile(filename: str): if _update_special_case_links(filename, jsonData, file, desired_links): return - validLinks = [link for link in existingInfo if link.startswith(unofficialLink) or link.startswith(officialLink)] - if validLinks and existingInfo == validLinks: + if _has_complete_links(existingInfo): return print(f"Processing {filename}...") formattedName = formatNameForSearch(jsonData["displayname"]) + candidateUrls = _candidate_page_urls(formattedName) # Attempt to find Unofficial and Official wiki links - fullUnofficialLink = unofficialLink + modifyUnofficialItem(formattedName) - fullOfficialLink = officialLink + modifyOfficialItem(formattedName) + fullUnofficialLink = candidateUrls[0] + fullOfficialLink = candidateUrls[1] unofficialExists = doesPageExist(fullUnofficialLink) officialExists = doesPageExist(fullOfficialLink) # Try with lowercase prepositions if initial attempt fails - if not unofficialExists and ("_Of_" in formattedName or "_The_" in formattedName or "_To_" in formattedName): - formattedName_lower_prepositions = _replace_title_case_prepositions(formattedName) - fullUnofficialLink = unofficialLink + modifyUnofficialItem(formattedName_lower_prepositions) + if not unofficialExists and len(candidateUrls) > 2: + fullUnofficialLink = candidateUrls[2] unofficialExists = doesPageExist(fullUnofficialLink) - if not officialExists and ("_Of_" in formattedName or "_The_" in formattedName or "_To_" in formattedName): - formattedName_lower_prepositions = _replace_title_case_prepositions(formattedName) - fullOfficialLink = officialLink + modifyOfficialItem(formattedName_lower_prepositions) + if not officialExists and len(candidateUrls) > 3: + fullOfficialLink = candidateUrls[3] officialExists = doesPageExist(fullOfficialLink) fileModified = False @@ -212,36 +314,7 @@ def removeColourCodes(string: str) -> str: def doesPageExist(pageUrl: str) -> bool: - if pageUrl in attemptedLinks: - return attemptedLinks[pageUrl] - - parsed = urlparse(pageUrl) - apiUrl = f"https://{parsed.netloc}/api.php" - pageTitle = unquote(parsed.path.removeprefix("/").removeprefix("w/").removeprefix("wiki/")) - - response = httpPool.request( - "GET", - apiUrl, - fields={ - "action": "query", - "format": "json", - "titles": pageTitle, - "redirects": 1, - "formatversion": 2, - }, - ) - - if response.status != 200: - print(f"Failed to fetch {pageUrl} ({response.status})") - attemptedLinks[pageUrl] = False - return False - - payload = json.loads(response.data.decode("utf-8")) - pages = payload.get("query", {}).get("pages", []) - success = pages and "missing" not in pages[0] - - attemptedLinks[pageUrl] = success - return success + return attemptedLinks.get(pageUrl, False) def capitalizeWords(string: str) -> str: @@ -285,6 +358,19 @@ def modifyOfficialItem(officialItem: str) -> str: print("Starting item file processing...") jsonFiles = getItemFiles() + lookupUrls = [] + for item in jsonFiles: + filePath = os.path.join(itemsDirectory, item) + with open(filePath, 'r', encoding='utf-8') as file: + jsonData = json.load(file) + if _should_skip_for_lookup(item, jsonData): + continue + + formattedName = formatNameForSearch(jsonData["displayname"]) + lookupUrls.extend(_candidate_page_urls(formattedName)) + + _prime_page_existence_cache(lookupUrls) + for item in jsonFiles: processItemFile(item)