From b9431ff12a4a4dd1f1f2786183e32081b78fcd9f Mon Sep 17 00:00:00 2001 From: Matthew Davis Date: Mon, 25 May 2026 12:08:14 +1000 Subject: [PATCH 1/5] Create raw_data_cache if it does not yet exist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-picked from #67 (commit 976b872). cache_compiler's contract is "build me a typed cache from these arguments" — making the user create the destination directory first is needless friction. If the path doesn't exist, create it. If it exists but as a file rather than a directory, that's clearly a typo, so raise UserInputError naming the path. Conflict resolution: the cherry-pick context assumed PR #67's full state including the None-check guard; master already has the None check (from PR #80) and an older error-message wording on the following line. Took PR #67's version of the isfile/makedirs block; the None check above it is unchanged from master. Scope is unchanged from PR #67 — dynamic_data_compiler and static_table still require the directory to exist (they're read-side operations; auto-creation only makes sense for cache_compiler). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/nemosis/data_fetch_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nemosis/data_fetch_methods.py b/src/nemosis/data_fetch_methods.py index 0aeb6f9..6cc68b5 100644 --- a/src/nemosis/data_fetch_methods.py +++ b/src/nemosis/data_fetch_methods.py @@ -195,7 +195,10 @@ def cache_compiler( raise UserInputError("The raw_data_location provided is None.") if not _os.path.isdir(raw_data_location): - raise UserInputError("The raw_data_location provided does not exist.") + if _os.path.isfile(raw_data_location): + raise UserInputError(f"The raw_data_location {raw_data_location} provided exists as a file, not directory.") + else: + _os.makedirs(raw_data_location) if table_name not in _defaults.dynamic_tables: raise UserInputError("Table name provided is not a dynamic table.") From 863c2d4058c6362268244a28178ec48c1347eb7d Mon Sep 17 00:00:00 2001 From: Matthew Davis Date: Wed, 8 Apr 2026 14:56:36 +0200 Subject: [PATCH 2/5] Set keep_csv=True by default for cache_compiler --- src/nemosis/data_fetch_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nemosis/data_fetch_methods.py b/src/nemosis/data_fetch_methods.py index 6cc68b5..fe45145 100644 --- a/src/nemosis/data_fetch_methods.py +++ b/src/nemosis/data_fetch_methods.py @@ -156,7 +156,7 @@ def cache_compiler( select_columns=None, fformat="feather", rebuild=False, - keep_csv=False, + keep_csv=True, **kwargs, ): """ @@ -184,7 +184,7 @@ def cache_compiler( rebuild (bool): If True then cache files are rebuilt even if they exist already. False by default. keep_csv (bool): If True raw CSVs from AEMO are not deleted after - the cache is built. False by default + the cache is built. True by default **kwargs: additional arguments passed to the pd.to_{fformat}() function Returns: From e9594817c82128902747bd476b316b396054746f Mon Sep 17 00:00:00 2001 From: Matthew Davis Date: Wed, 8 Apr 2026 14:59:25 +0200 Subject: [PATCH 3/5] Tidy up incomplete feather/parquet (#55) --- src/nemosis/data_fetch_methods.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/nemosis/data_fetch_methods.py b/src/nemosis/data_fetch_methods.py index fe45145..d9a98e5 100644 --- a/src/nemosis/data_fetch_methods.py +++ b/src/nemosis/data_fetch_methods.py @@ -781,11 +781,17 @@ def _write_to_format(data, fformat, full_filename, write_kwargs): if _os.path.isfile(full_filename) and fformat != "csv": _os.unlink(full_filename) # Write to required format - if fformat == "feather": - write_function[fformat](full_filename, **write_kwargs) - elif fformat == "parquet": - write_function[fformat](full_filename, index=False, **write_kwargs) - return + try: + if fformat == "feather": + write_function[fformat](full_filename, **write_kwargs) + elif fformat == "parquet": + write_function[fformat](full_filename, index=False, **write_kwargs) + return + except Exception: + # tidy up incomplete file + if os.path.exists(full_filename): + os.remove(full_filename) + raise def _download_data( From 1066018debc967164438bd0dd6140d7ad4f08a12 Mon Sep 17 00:00:00 2001 From: nick-gorman Date: Mon, 25 May 2026 12:12:34 +1000 Subject: [PATCH 4/5] Fix os/_os shadowing in cleanup branch and add regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related changes: 1. Source fix: the cherry-picked cleanup block in _write_to_format used bare 'os.path.exists' and 'os.remove', but data_fetch_methods.py imports the stdlib module as 'os as _os'. So the cleanup path raised NameError("name 'os' is not defined") whenever a feather/parquet write actually failed — silently regressing the issue #55 fix to worse-than-before, because the user-visible exception became NameError instead of the real write error, and the partial file was never cleaned up. Replaced with _os.path.isfile / _os.unlink to match the existing pre-write cleanup convention on lines 781-782 of the same function. 2. Regression tests in tests/end_to_end_table_tests/test_cache_compiler.py covering all three behaviours added in this PR plus the bug above: - test_creates_cache_directory_when_missing - test_raises_when_cache_path_is_a_file - test_keep_csv_true_by_default - test_write_to_format_cleans_up_partial_file_on_failure (parametrized over feather and parquet — both formats fail in the same way under the bug, both pass after the fix) The write-cleanup tests were authored before applying the source fix and confirmed to fail on the buggy version with the exact NameError from line 792, then to pass after the fix — so they are real regression coverage, not coincidental. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/nemosis/data_fetch_methods.py | 4 +- .../test_cache_compiler.py | 80 ++++++++++++++++++- 2 files changed, 79 insertions(+), 5 deletions(-) diff --git a/src/nemosis/data_fetch_methods.py b/src/nemosis/data_fetch_methods.py index d9a98e5..461e953 100644 --- a/src/nemosis/data_fetch_methods.py +++ b/src/nemosis/data_fetch_methods.py @@ -789,8 +789,8 @@ def _write_to_format(data, fformat, full_filename, write_kwargs): return except Exception: # tidy up incomplete file - if os.path.exists(full_filename): - os.remove(full_filename) + if _os.path.isfile(full_filename): + _os.unlink(full_filename) raise diff --git a/tests/end_to_end_table_tests/test_cache_compiler.py b/tests/end_to_end_table_tests/test_cache_compiler.py index ee715d0..e78cec5 100644 --- a/tests/end_to_end_table_tests/test_cache_compiler.py +++ b/tests/end_to_end_table_tests/test_cache_compiler.py @@ -2,13 +2,16 @@ that writes typed feather / parquet files to disk for downstream consumers. Verifies (a) the round-trip through cache → dynamic_data_compiler preserves -typed columns and (b) `select_columns` narrows the cached file itself, not -just the returned frame. +typed columns, (b) `select_columns` narrows the cached file itself, not +just the returned frame, (c) cache directory is auto-created when missing, +(d) keep_csv=True is the default behaviour, and (e) partial cache files +are cleaned up when a feather/parquet write fails mid-flight. """ import pandas as pd import pytest -from nemosis import cache_compiler, dynamic_data_compiler +from nemosis import cache_compiler, data_fetch_methods, dynamic_data_compiler +from nemosis.custom_errors import UserInputError START = "2018/05/01 00:00:00" @@ -57,3 +60,74 @@ def test_select_columns_narrows_cached_file(nemosis_fixture): for path in parquet_files: on_disk = pd.read_parquet(path) assert set(on_disk.columns) == {"SETTLEMENTDATE", "REGIONID"}, path + + +def test_creates_cache_directory_when_missing(nemosis_fixture): + """cache_compiler builds caches — making the user mkdir first is needless + friction. If the destination is missing, create it.""" + target = nemosis_fixture / "new_subdir_that_does_not_exist" + assert not target.exists() + + cache_compiler( + start_time=START, end_time=END, + table_name="DISPATCHPRICE", + raw_data_location=str(target), + ) + + assert target.is_dir() + assert list(target.glob("*.feather")), "cache should be populated" + + +def test_raises_when_cache_path_is_a_file(nemosis_fixture): + """A path that points at a regular file is clearly a typo, not a + cache directory to create — surface that as UserInputError.""" + not_a_dir = nemosis_fixture / "oops.txt" + not_a_dir.write_text("I am not a directory") + + with pytest.raises(UserInputError, match="exists as a file"): + cache_compiler( + start_time=START, end_time=END, + table_name="DISPATCHPRICE", + raw_data_location=str(not_a_dir), + ) + + +def test_keep_csv_true_by_default(nemosis_fixture): + """The default behaviour is to retain the raw AEMO CSVs alongside the + typed feather file so downstream consumers can re-read them without + re-fetching from AEMO.""" + cache_compiler( + start_time=START, end_time=END, + table_name="DISPATCHPRICE", + raw_data_location=str(nemosis_fixture), + # no keep_csv kwarg — use the default + ) + csv_files = list(nemosis_fixture.glob("*DISPATCHPRICE*.csv")) + assert csv_files, "default keep_csv=True should retain raw CSVs" + + +@pytest.mark.parametrize("fformat", ["feather", "parquet"]) +def test_write_to_format_cleans_up_partial_file_on_failure(tmp_path, monkeypatch, fformat): + """A mid-write failure (e.g. disk full) used to leave a partial, + unreadable feather/parquet on disk. Subsequent runs would then trip + on the corrupt file. _write_to_format now removes the partial file + in its except branch before re-raising the original exception.""" + target = tmp_path / f"x.{fformat}" + df = pd.DataFrame({"a": [1, 2, 3]}) + + method = "to_feather" if fformat == "feather" else "to_parquet" + + def fake_write(self, path, **kwargs): + # Simulate a partial write — bytes hit disk before the writer errors. + with open(path, "wb") as f: + f.write(b"partial-bytes-from-failed-write") + raise IOError("simulated disk full mid-write") + + monkeypatch.setattr(pd.DataFrame, method, fake_write) + + with pytest.raises(IOError, match="simulated disk full"): + data_fetch_methods._write_to_format(df, fformat, str(target), {}) + + assert not target.exists(), ( + f"partial {fformat} file should have been cleaned up after the write failure" + ) From 15bff82814020b50c19ce714aa7b5e6fb9395ee8 Mon Sep 17 00:00:00 2001 From: nick-gorman Date: Mon, 25 May 2026 12:27:59 +1000 Subject: [PATCH 5/5] Fix CI failure: case-insensitive CSV glob, explicit rebuild, semantic-lock test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI surfaced two issues with test_keep_csv_true_by_default that weren't visible on Windows local: 1. AEMO zips contain CSV files with an uppercase '.CSV' extension. The test globbed for '*DISPATCHPRICE*.csv', which is case-insensitive on Windows (passed locally) but case-sensitive on Linux/macOS (failed on CI). NEMOSIS itself handles this internally with [cC][sS][vV] patterns — the test now uses [Cc][Ss][Vv] to match. 2. The test relied on the tmp_path being empty so the CSV-fetch path would run by default. Made the intent explicit with rebuild=True so the test deterministically forces the CSV-fetch branch regardless of starting cache state. While restructuring, also added: - test_keep_csv_false_removes_fetched_csv: mirror of the above with the override, confirming the opt-out path still removes CSVs. - test_existing_feather_means_no_csv_is_fetched: locks in the user- clarified semantic — keep_csv=True is about "don't delete a CSV we fetched", NOT about "create a CSV when feather already exists". Pre-populates empty feather files at the expected names so the "already compiled" short-circuit fires, then asserts no CSV was created. In caching_mode the existence check skips the actual read, so empty feather files are fine for the gate. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_cache_compiler.py | 63 ++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/tests/end_to_end_table_tests/test_cache_compiler.py b/tests/end_to_end_table_tests/test_cache_compiler.py index e78cec5..181ec54 100644 --- a/tests/end_to_end_table_tests/test_cache_compiler.py +++ b/tests/end_to_end_table_tests/test_cache_compiler.py @@ -92,18 +92,67 @@ def test_raises_when_cache_path_is_a_file(nemosis_fixture): ) -def test_keep_csv_true_by_default(nemosis_fixture): - """The default behaviour is to retain the raw AEMO CSVs alongside the - typed feather file so downstream consumers can re-read them without - re-fetching from AEMO.""" +def test_keep_csv_true_by_default_keeps_fetched_csv(nemosis_fixture): + """When cache_compiler actually has to fetch (no existing feather, so + the code path that downloads + extracts a CSV runs), the default + keep_csv=True must leave the extracted CSV on disk alongside the + feather. rebuild=True forces the fetch path so this test isn't + dependent on tmp_path being empty at start. + + AEMO zips contain CSV files with an uppercase .CSV extension — + NEMOSIS handles this internally with [cC][sS][vV] globs and the + test does the same.""" cache_compiler( start_time=START, end_time=END, table_name="DISPATCHPRICE", raw_data_location=str(nemosis_fixture), - # no keep_csv kwarg — use the default + rebuild=True, + # no keep_csv kwarg — exercises the default + ) + csv_files = list(nemosis_fixture.glob("*DISPATCHPRICE*.[Cc][Ss][Vv]")) + assert csv_files, "default keep_csv=True should retain the fetched CSV" + + +def test_keep_csv_false_removes_fetched_csv(nemosis_fixture): + """Mirror of the above with the override — verifies the opt-out + path still works (the source-side delete in _dynamic_data_fetch_loop + fires only when keep_csv is False).""" + cache_compiler( + start_time=START, end_time=END, + table_name="DISPATCHPRICE", + raw_data_location=str(nemosis_fixture), + rebuild=True, + keep_csv=False, + ) + csv_files = list(nemosis_fixture.glob("*DISPATCHPRICE*.[Cc][Ss][Vv]")) + assert not csv_files, "keep_csv=False should remove the fetched CSV" + + +def test_existing_feather_means_no_csv_is_fetched(nemosis_fixture): + """If the feather is already in the cache, cache_compiler must take + the "already compiled" short-circuit and not fetch a CSV — keep_csv + is only about retaining a CSV we actually downloaded, not about + creating one out of thin air. Pre-populate empty feather files at + the expected filenames (in caching_mode the existence check skips + the read), call cache_compiler without rebuild, and verify no CSV + appeared.""" + # April + May because NEMOSIS uses a 1-day buffer-back, so a query + # starting 2018-05-01 also scans the 2018-04 archive. + for month in ("201804", "201805"): + (nemosis_fixture / f"PUBLIC_DVD_DISPATCHPRICE_{month}010000.feather").touch() + + cache_compiler( + start_time=START, end_time=END, + table_name="DISPATCHPRICE", + raw_data_location=str(nemosis_fixture), + # default keep_csv=True — would matter if a CSV was fetched + ) + + csv_files = list(nemosis_fixture.glob("*DISPATCHPRICE*.[Cc][Ss][Vv]")) + assert not csv_files, ( + "keep_csv=True should NOT cause a CSV to be created when the " + "feather already exists — the CSV branch must not run at all" ) - csv_files = list(nemosis_fixture.glob("*DISPATCHPRICE*.csv")) - assert csv_files, "default keep_csv=True should retain raw CSVs" @pytest.mark.parametrize("fformat", ["feather", "parquet"])