Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/nemosis/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,13 @@ def download_to_dir(url, down_load_to, force_redo=False):

Streams the response so large files don't have to fit in memory.
"""
url = url.replace('#', '%23')
# Post-2024-07 AEMO archive files are stored on nemweb with a literal
# `%23` in their on-disk filenames (not `#`). To match, the URL must
# contain `%2523`, which nemweb decodes once to `%23` and so finds the
# file. A single `%23` would decode to `#` and the request would fail
# with HTTP 400. Pre-Aug-2024 PUBLIC_DVD_* filenames don't contain `#`,
# so the replace is a no-op for the older path. See issue #74.
url = url.replace('#', '%2523')
filename = url.split('/')[-1].split('?')[0]
path = os.path.join(down_load_to, filename)
downloaded = download_to_path(url, path, force_redo=force_redo)
Expand All @@ -370,7 +376,11 @@ def download_to_path(url, path_and_name, force_redo=False):
mid-stream, the partial output file is removed before the
exception propagates.
"""
url = url.replace('#', '%23')
# See `download_to_dir` for why this is `%2523` and not `%23`.
# Repeated here because `download_to_path` is also called directly
# (e.g. from `download_csv`); the replace is idempotent (`%2523`
# contains no `#`), so applying it a second time when the URL arrives
# via `download_to_dir` leaves the URL unchanged.
url = url.replace('#', '%2523')
if os.path.isfile(path_and_name) and not force_redo:
return False

Expand Down
16 changes: 14 additions & 2 deletions tests/fixtures/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,12 @@

def http_get(url: str) -> requests.Response:
# Log and GET `url` with the module's USR_AGENT headers and a 180 s
# timeout, call `raise_for_status()` on the response, and return it.
log.info("GET %s", url)
r = requests.get(url.replace("#", "%23"), headers=USR_AGENT, timeout=180)
# NOTE(review): this text is a rendered diff — the `%23` call above
# appears to be the removed line and the `%2523` call below its
# replacement; confirm against the merged file.
#
# Post-2024-07 PUBLIC_ARCHIVE# files are stored on nemweb with a literal
# `%23` in their on-disk filenames. The HTTP URL must contain `%2523`,
# which nemweb decodes once to `%23` and so finds the file. A single
# `%23` would decode to `#` and the request would fail with HTTP 400.
# Mirrors the fix in nemosis.downloader.download_to_dir and
# nemosis.downloader.download_to_path (issue #74).
r = requests.get(url.replace("#", "%2523"), headers=USR_AGENT, timeout=180)
r.raise_for_status()
return r

Expand Down Expand Up @@ -140,13 +145,20 @@ def mms_filename(table: str, era_date: date, chunk: int) -> str:

def mms_fixture_path(table: str, era_date: date, chunk: int) -> Path:
# Build the fixture path for one MMS archive file under FIXTURE_ROOT:
# Data_Archive/Wholesale_Electricity/MMSDM/<year>/MMSDM_<year>_<month>/
# MMSDM_Historical_Data_SQLLoader/DATA/<filename>, with a zero-padded
# two-digit month.
year, month = era_date.year, f"{era_date.month:02d}"
# Post-2024-07 archives are stored on nemweb with a literal `%23` in
# the filename (not `#`). Mirror that on disk so the offline mock
# server serves files under the same name the real server uses:
# NEMOSIS now requests URLs containing `%2523`, which decode once to
# `%23`, so a fixture saved with a raw `#` would not match. Pre-Aug-2024
# PUBLIC_DVD_* filenames don't contain `#`, so the replace is a no-op
# there. See issue #74.
on_disk_name = mms_filename(table, era_date, chunk).replace("#", "%23")
return (
FIXTURE_ROOT
/ "Data_Archive/Wholesale_Electricity/MMSDM"
/ str(year)
/ f"MMSDM_{year}_{month}"
/ "MMSDM_Historical_Data_SQLLoader/DATA"
# NOTE(review): in this diff rendering the `mms_filename(...)` operand
# below appears to be the removed line and `on_disk_name` its
# replacement; confirm against the merged file.
/ mms_filename(table, era_date, chunk)
/ on_disk_name
)


Expand Down
Loading