Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 126 additions & 40 deletions .github/scripts/updateWikiLinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,86 @@ def _escape_wiki_links(links: list[str]) -> list[str]:
links[i] = link.replace("=", "\\u003d").replace("'", "\\u0027")
return links

def _page_title_from_url(pageUrl: str) -> tuple[str, str]:
    """Split a wiki page URL into its API endpoint and page title.

    The endpoint is always ``https://<host>/api.php``. The title is the URL
    path with a leading ``/``, then ``w/``, then ``wiki/`` removed (each at
    most once, in that order) and with percent-escapes decoded.

    Returns:
        A ``(apiUrl, pageTitle)`` tuple.
    """
    urlParts = urlparse(pageUrl)
    rawTitle = urlParts.path
    # Order matters: each prefix is only stripped when it sits at the very
    # start of what is left after the previous prefix was removed.
    for prefix in ("/", "w/", "wiki/"):
        rawTitle = rawTitle.removeprefix(prefix)
    return f"https://{urlParts.netloc}/api.php", unquote(rawTitle)

def _title_statuses_from_payload(payload: dict) -> dict[str, bool]:
    """Map every title named in a query response to whether its page exists.

    Covers the titles in ``query.pages`` plus every ``from`` title listed in
    ``query.redirects`` and ``query.normalized``. Alias chains (for example
    normalized -> redirect -> page) are followed to their final target, so
    the result does not depend on the order of the entries in the response.
    """
    query = payload.get("query", {})

    pageStatuses = {}
    for page in query.get("pages", []):
        title = page.get("title")
        if title is None:
            continue
        # A page flagged "missing" does not exist and one flagged "invalid"
        # has a malformed title; neither can be linked to.
        pageStatuses[title] = "missing" not in page and "invalid" not in page

    # "normalized" is read last so it wins if a title appears in both lists.
    aliases = {}
    for section in ("redirects", "normalized"):
        for entry in query.get(section, []):
            aliases[entry["from"]] = entry["to"]

    statuses = dict(pageStatuses)
    for source in aliases:
        target = source
        visited = set()
        # Follow the chain to its end; `visited` guards against alias loops.
        while target in aliases and target not in visited:
            visited.add(target)
            target = aliases[target]
        statuses[source] = pageStatuses.get(target, False)
    return statuses


def _batch_page_existence_requests(pageUrls: list[str]):
    """Resolve page existence for same-host URLs in batched API queries.

    The URLs are queried 50 titles at a time (the MediaWiki per-request
    title limit for regular clients) and the outcome for every URL is
    written to ``attemptedLinks``. A batch whose request fails, or whose
    body cannot be decoded as JSON, marks all of its URLs as not existing.

    Args:
        pageUrls: Page URLs that all belong to the same wiki host.

    Raises:
        ValueError: If a batch contains URLs from more than one wiki host.
    """
    batchSize = 50
    total = len(pageUrls)
    for start in range(0, total, batchSize):
        batch = pageUrls[start:start + batchSize]
        apiUrl, _ = _page_title_from_url(batch[0])
        hostname = urlparse(apiUrl).netloc
        print(f"Performing batch page lookup for {hostname} ({start:,}/{total:,})")

        titlesByUrl = {}
        for pageUrl in batch:
            batchApiUrl, pageTitle = _page_title_from_url(pageUrl)
            if batchApiUrl != apiUrl:
                raise ValueError("Batch contains mixed wiki hosts")
            titlesByUrl[pageUrl] = pageTitle

        response = httpPool.request(
            "GET",
            apiUrl,
            fields={
                "action": "query",
                "format": "json",
                "titles": "|".join(titlesByUrl.values()),
                "redirects": 1,
                "formatversion": 2,
            },
        )

        if response.status != 200:
            print(f"Failed to fetch batch from {apiUrl} ({response.status})")
            for pageUrl in batch:
                attemptedLinks[pageUrl] = False
            continue

        try:
            payload = json.loads(response.data.decode("utf-8"))
        except ValueError as error:
            # Both UnicodeDecodeError and json.JSONDecodeError are ValueError
            # subclasses; treat an unreadable body like a failed request
            # instead of aborting the whole run.
            print(f"Failed to decode batch response from {apiUrl} ({error})")
            for pageUrl in batch:
                attemptedLinks[pageUrl] = False
            continue

        titleStatuses = _title_statuses_from_payload(payload)
        for pageUrl, pageTitle in titlesByUrl.items():
            attemptedLinks[pageUrl] = titleStatuses.get(pageTitle, False)

def _prime_page_existence_cache(pageUrls: list[str]):
    """Look up every distinct URL in *pageUrls*, one wiki API at a time.

    Duplicate URLs are dropped (first occurrence wins), the remainder are
    grouped by their API endpoint, and each group is handed to
    ``_batch_page_existence_requests`` in first-seen order.
    """
    groupedUrls = {}
    alreadySeen = set()
    for pageUrl in pageUrls:
        if pageUrl in alreadySeen:
            continue
        alreadySeen.add(pageUrl)

        endpoint = _page_title_from_url(pageUrl)[0]
        if endpoint not in groupedUrls:
            groupedUrls[endpoint] = []
        groupedUrls[endpoint].append(pageUrl)

    for sameHostUrls in groupedUrls.values():
        _batch_page_existence_requests(sameHostUrls)

def _candidate_page_urls(formattedName: str) -> list[str]:
    """Build the wiki URLs worth checking for an item name.

    The result always starts with the unofficial URL followed by the
    official URL for *formattedName*. When the name contains ``_Of_``,
    ``_The_`` or ``_To_``, a second unofficial/official pair is appended,
    built from ``_replace_title_case_prepositions(formattedName)``.

    Returns:
        Two URLs, or four when the preposition variant applies.
    """
    def urlPair(name: str) -> list[str]:
        # Unofficial first, official second; callers index by position.
        return [
            unofficialLink + modifyUnofficialItem(name),
            officialLink + modifyOfficialItem(name),
        ]

    candidates = urlPair(formattedName)

    if any(marker in formattedName for marker in ("_Of_", "_The_", "_To_")):
        candidates += urlPair(_replace_title_case_prepositions(formattedName))

    return candidates

def _update_special_case_links(filename: str, jsonData: dict, file, desired_links: list[str]) -> bool:
global modifiedCount
desired_links = _escape_wiki_links(desired_links)
Expand All @@ -66,6 +146,30 @@ def _update_special_case_links(filename: str, jsonData: dict, file, desired_link
json.dump(jsonData, file, indent=2, ensure_ascii=False)
return file_modified

def _has_complete_links(existingInfo: list[str]) -> bool:
    """Return True when *existingInfo* is non-empty and holds only wiki links.

    An entry counts as a wiki link when it starts with ``unofficialLink`` or
    ``officialLink``. The result is always a real ``bool``: an empty list
    yields ``False`` rather than the empty list itself.
    """
    validLinks = [
        link for link in existingInfo
        if link.startswith((unofficialLink, officialLink))
    ]
    # bool() keeps the declared return type; a bare `validLinks and ...`
    # would leak [] to callers when nothing matched.
    return bool(validLinks) and existingInfo == validLinks

def _should_skip_for_lookup(filename: str, jsonData: dict) -> bool:
    """Decide whether an item file needs no wiki page lookup.

    An item is skipped when any of these hold, checked in order:
      * its JSON has a ``vanilla`` key;
      * its ``itemid`` is ``minecraft:enchanted_book`` or ``minecraft:potion``;
      * its filename starts with one of the excluded prefixes;
      * its ``info`` list already consists solely of wiki links.

    Raises:
        KeyError: If *jsonData* has neither ``vanilla`` nor ``itemid``.
    """
    if "vanilla" in jsonData:
        return True

    if jsonData["itemid"] in ("minecraft:enchanted_book", "minecraft:potion"):
        return True

    excludedPrefixes = ('⚚_', 'ATTRIBUTE_', 'BALLOON_HAT_2024', 'BALLOON_HAT_2025')
    if filename.startswith(excludedPrefixes):
        return True

    return bool(_has_complete_links(jsonData.get("info", [])))


def processItemFile(filename: str):
global modifiedCount, badModifiedCount
Expand Down Expand Up @@ -102,30 +206,28 @@ def processItemFile(filename: str):
if _update_special_case_links(filename, jsonData, file, desired_links):
return

validLinks = [link for link in existingInfo if link.startswith(unofficialLink) or link.startswith(officialLink)]
if validLinks and existingInfo == validLinks:
if _has_complete_links(existingInfo):
return

print(f"Processing {filename}...")

formattedName = formatNameForSearch(jsonData["displayname"])
candidateUrls = _candidate_page_urls(formattedName)

# Attempt to find Unofficial and Official wiki links
fullUnofficialLink = unofficialLink + modifyUnofficialItem(formattedName)
fullOfficialLink = officialLink + modifyOfficialItem(formattedName)
fullUnofficialLink = candidateUrls[0]
fullOfficialLink = candidateUrls[1]

unofficialExists = doesPageExist(fullUnofficialLink)
officialExists = doesPageExist(fullOfficialLink)

# Try with lowercase prepositions if initial attempt fails
if not unofficialExists and ("_Of_" in formattedName or "_The_" in formattedName or "_To_" in formattedName):
formattedName_lower_prepositions = _replace_title_case_prepositions(formattedName)
fullUnofficialLink = unofficialLink + modifyUnofficialItem(formattedName_lower_prepositions)
if not unofficialExists and len(candidateUrls) > 2:
fullUnofficialLink = candidateUrls[2]
unofficialExists = doesPageExist(fullUnofficialLink)

if not officialExists and ("_Of_" in formattedName or "_The_" in formattedName or "_To_" in formattedName):
formattedName_lower_prepositions = _replace_title_case_prepositions(formattedName)
fullOfficialLink = officialLink + modifyOfficialItem(formattedName_lower_prepositions)
if not officialExists and len(candidateUrls) > 3:
fullOfficialLink = candidateUrls[3]
officialExists = doesPageExist(fullOfficialLink)

fileModified = False
Expand Down Expand Up @@ -212,36 +314,7 @@ def removeColourCodes(string: str) -> str:


def doesPageExist(pageUrl: str) -> bool:
    """Return whether the wiki page at *pageUrl* exists.

    The answer is served from ``attemptedLinks`` when the URL has already
    been resolved (for example by the batched cache priming). On a cache
    miss a single-title API query is made and its outcome is cached, so a
    URL that was never primed is still checked instead of being silently
    reported as missing.
    """
    if pageUrl in attemptedLinks:
        return attemptedLinks[pageUrl]

    apiUrl, pageTitle = _page_title_from_url(pageUrl)

    response = httpPool.request(
        "GET",
        apiUrl,
        fields={
            "action": "query",
            "format": "json",
            "titles": pageTitle,
            "redirects": 1,
            "formatversion": 2,
        },
    )

    if response.status != 200:
        print(f"Failed to fetch {pageUrl} ({response.status})")
        attemptedLinks[pageUrl] = False
        return False

    payload = json.loads(response.data.decode("utf-8"))
    pages = payload.get("query", {}).get("pages", [])
    # bool() keeps the result a real boolean when `pages` is empty; pages
    # flagged "missing" or "invalid" (malformed title) do not exist.
    success = (
        bool(pages)
        and "missing" not in pages[0]
        and "invalid" not in pages[0]
    )

    attemptedLinks[pageUrl] = success
    return success


def capitalizeWords(string: str) -> str:
Expand Down Expand Up @@ -285,6 +358,19 @@ def modifyOfficialItem(officialItem: str) -> str:
print("Starting item file processing...")
jsonFiles = getItemFiles()

# First pass: read every item file and collect the candidate wiki URLs of the
# items that are not skipped, so they can all be resolved in batched lookups.
lookupUrls = []
for item in jsonFiles:
filePath = os.path.join(itemsDirectory, item)
with open(filePath, 'r', encoding='utf-8') as file:
jsonData = json.load(file)
if _should_skip_for_lookup(item, jsonData):
continue

formattedName = formatNameForSearch(jsonData["displayname"])
lookupUrls.extend(_candidate_page_urls(formattedName))

# Resolve all collected URLs before any file is processed.
_prime_page_existence_cache(lookupUrls)

# Second pass: process each item file.
for item in jsonFiles:
processItemFile(item)

Expand Down
Loading