|
| 1 | +"""Chunked-VRT coverage for ``missing_sources`` (issue #1799). |
| 2 | +
|
| 3 | +``test_vrt_missing_sources_policy_1799`` covers the eager (non-chunked) |
| 4 | +``read_vrt`` path. The chunked path (``read_vrt(chunks=N)``, dispatching |
| 5 | +through ``_read_vrt_chunked``) plumbs ``missing_sources`` separately: |
| 6 | +
|
| 7 | +* Parse-time approximation: a static ``os.path.exists`` sweep over every |
| 8 | + source populates ``attrs['vrt_holes']`` on the returned DataArray |
| 9 | + before any decode work starts (docstring in ``_backends/vrt.py:344``). |
| 10 | +* Decode-time: each per-chunk task receives ``missing_sources`` and the |
| 11 | + internal reader applies the same warn/raise policy as the eager path. |
| 12 | +
|
| 13 | +A regression dropping either the parse-time sweep or the per-chunk |
| 14 | +forward would silently change the contract: |
| 15 | +
|
| 16 | +* ``vrt_holes`` would disappear from the lazy build, breaking callers |
| 17 | + that branch on ``"vrt_holes" in da.attrs`` to detect partial mosaics |
| 18 | + before scheduling a compute (the contract documented in #1734). |
| 19 | +* ``missing_sources='raise'`` could silently degrade to ``'warn'`` (or |
| 20 | + vice versa) on the chunked path while the eager path stays correct. |
| 21 | +
|
| 22 | +This module pins both invariants. Tests use a 2-source mosaic where one |
| 23 | +source is missing on disk; the present source covers one chunk window |
| 24 | +and the missing source covers another, so the warn/raise policy is |
| 25 | +exercised against a non-trivial graph. |
| 26 | +""" |
| 27 | +from __future__ import annotations |
| 28 | + |
| 29 | +import os |
| 30 | +import warnings |
| 31 | + |
| 32 | +import numpy as np |
| 33 | +import pytest |
| 34 | +import xarray as xr |
| 35 | + |
| 36 | +from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt, to_geotiff |
| 37 | + |
| 38 | + |
def _make_partial_vrt(tmp_path: str) -> tuple[str, str]:
    """Build a 2-source VRT with one present and one missing source.

    The mosaic is 8 columns by 4 rows. A 4x4 float32 GeoTIFF filled with
    ``7.0`` is written into ``tmp_path`` and mapped onto the left half
    (columns 0-3). The right half (columns 4-7) references
    ``missing.tif``, which is never created, so chunked reads against
    the right half hit the missing-source decode path.

    ``tmp_path`` is the directory (as a string) that receives both the
    GeoTIFF and the VRT file.

    Returns ``(vrt_path, present_src_path)``.
    """
    # Local import keeps this helper self-contained; the paths are
    # escaped so a temp directory containing ``&`` or ``<`` still
    # yields well-formed XML.
    from xml.sax.saxutils import escape

    src = os.path.join(tmp_path, "src_present.tif")
    arr = np.full((4, 4), 7.0, dtype=np.float32)
    da = xr.DataArray(
        arr, dims=("y", "x"),
        attrs={"transform": (1.0, 0.0, 0.0, 0.0, -1.0, 0.0)},
    )
    to_geotiff(da, src)

    # Deliberately never written to disk: this is the hole in the mosaic.
    missing = os.path.join(tmp_path, "missing.tif")
    vrt_path = os.path.join(tmp_path, "partial.vrt")

    vrt_xml = (
        '<VRTDataset rasterXSize="8" rasterYSize="4">\n'
        '<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>\n'
        '<VRTRasterBand dataType="Float32" band="1">\n'
        '<SimpleSource>\n'
        f'<SourceFilename relativeToVRT="0">{escape(src)}</SourceFilename>\n'
        '<SourceBand>1</SourceBand>\n'
        '<SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '<DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '</SimpleSource>\n'
        '<SimpleSource>\n'
        f'<SourceFilename relativeToVRT="0">{escape(missing)}</SourceFilename>\n'
        '<SourceBand>1</SourceBand>\n'
        '<SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '<DstRect xOff="4" yOff="0" xSize="4" ySize="4"/>\n'
        '</SimpleSource>\n'
        '</VRTRasterBand>\n'
        '</VRTDataset>\n'
    )
    # The document carries no XML declaration, so readers assume UTF-8;
    # write it that way instead of relying on the locale default.
    with open(vrt_path, "w", encoding="utf-8") as f:
        f.write(vrt_xml)
    return vrt_path, src
| 78 | + |
| 79 | + |
class TestChunkedMissingSourcesWarn:
    """Lenient policy on the chunked path: holes are recorded at build.

    With ``read_vrt(chunks=N, missing_sources='warn')`` the returned
    lazy DataArray must already carry ``attrs['vrt_holes']`` before
    anything is computed, and computing it must still surface a
    ``GeoTIFFFallbackWarning`` for the source that is absent on disk.
    """

    def test_vrt_holes_populated_at_build(self, tmp_path):
        """``vrt_holes`` lists the single missing source without a compute."""
        vrt_file = _make_partial_vrt(str(tmp_path))[0]
        lazy = read_vrt(vrt_file, chunks=4, missing_sources="warn")
        attrs = lazy.attrs
        assert "vrt_holes" in attrs, (
            "Chunked path must populate vrt_holes at build time so "
            "callers can detect partial mosaics without forcing a "
            "compute (issue #1734)."
        )
        recorded = attrs["vrt_holes"]
        assert len(recorded) == 1
        first_hole = recorded[0]
        assert first_hole["source"].endswith("missing.tif")

    def test_compute_emits_per_task_warning(self, tmp_path):
        """Computing the lazy array warns about the missing source."""
        vrt_file = _make_partial_vrt(str(tmp_path))[0]
        with warnings.catch_warnings(record=True) as records:
            # Record every warning, including repeats of the same message.
            warnings.simplefilter("always")
            lazy = read_vrt(vrt_file, chunks=4, missing_sources="warn")
            materialised = lazy.compute()

        fallback_texts = []
        for record in records:
            if isinstance(record.message, GeoTIFFFallbackWarning):
                fallback_texts.append(str(record.message))

        names_missing = any("missing.tif" in text for text in fallback_texts)
        assert names_missing, (
            f"Expected GeoTIFFFallbackWarning naming the missing "
            f"source after compute, got messages: {fallback_texts!r}"
        )

        # Only the left (present-source) half is pinned; the fill used
        # for the missing half is intentionally not asserted. This
        # catches a lenient path that wipes both halves.
        left_half = np.asarray(materialised)[:, :4]
        expected = np.full((4, 4), 7.0, dtype=np.float32)
        np.testing.assert_array_equal(left_half, expected)

    def test_chunks_tuple_form(self, tmp_path):
        """Tuple ``chunks=(h, w)`` threads through identically."""
        vrt_file = _make_partial_vrt(str(tmp_path))[0]
        lazy = read_vrt(vrt_file, chunks=(2, 4), missing_sources="warn")
        attrs = lazy.attrs
        assert "vrt_holes" in attrs
        # A (2, 4) chunking of the 4x8 mosaic gives a 2x2 grid of
        # chunks, two of which overlap the missing right half. The
        # test expects the missing source to be listed once, not once
        # per overlapping chunk.
        assert len(attrs["vrt_holes"]) == 1
| 135 | + |
| 136 | + |
class TestChunkedMissingSourcesRaise:
    """``read_vrt(chunks=N, missing_sources='raise')`` fails on compute.

    The eager path raises at read time. The chunked path defers to
    compute because each chunk's decode is delayed; an upfront raise
    would force the parse-time sweep to decode every source, defeating
    the lazy graph. The contract: chunks intersecting a missing source
    raise on compute; chunks intersecting only present sources still
    succeed.
    """

    def test_compute_intersecting_missing_raises(self, tmp_path):
        """Building the graph succeeds; computing the full mosaic raises."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        # Kept outside ``pytest.raises`` on purpose: the build step is
        # lazy and must not raise by itself.
        result = read_vrt(vrt_path, chunks=4, missing_sources="raise")
        # Computing a chunk that intersects the missing source raises.
        # ``FileNotFoundError`` is a subclass of ``OSError``, so it is
        # already covered by this tuple.
        with pytest.raises((OSError, ValueError)):
            result.compute()

    def test_compute_present_only_chunk_succeeds(self, tmp_path):
        """A windowed compute against only the present source succeeds.

        ``read_vrt(window=...)`` restricts the chunked graph to the
        windowed extent; if the window misses the missing source, no
        chunk needs to decode it and compute succeeds even under
        ``missing_sources='raise'``. The contract: the raise policy is
        scoped to chunks that actually touch missing sources.
        """
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        # Window covers only the present source (cols 0-3); the missing
        # source occupies cols 4-7.
        result = read_vrt(
            vrt_path, chunks=4, window=(0, 0, 4, 4),
            missing_sources="raise",
        )
        computed = result.compute()
        np.testing.assert_array_equal(
            np.asarray(computed), np.full((4, 4), 7.0, dtype=np.float32),
        )
| 175 | + |
| 176 | + |
class TestChunkedMissingSourcesDefault:
    """The default ``missing_sources`` on chunked reads is ``'raise'``.

    The public ``read_vrt`` default flipped to ``'raise'`` in #1843 /
    #1860. The chunked path goes through the same entry point so the
    default must agree. A regression flipping the chunked default to
    ``'warn'`` would silently produce partial mosaics for callers who
    don't pass the kwarg.
    """

    def test_chunked_default_raises_on_compute(self, tmp_path):
        """Omitting ``missing_sources`` behaves like ``'raise'`` on compute."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        # No ``missing_sources`` kwarg: this exercises the default.
        result = read_vrt(vrt_path, chunks=4)
        # ``FileNotFoundError`` is a subclass of ``OSError``, so it is
        # already covered by this tuple.
        with pytest.raises((OSError, ValueError)):
            result.compute()
| 192 | + |
| 193 | + |
class TestChunkedMissingSourcesValidation:
    """An unrecognised ``missing_sources`` value is refused by ``read_vrt``."""

    def test_invalid_policy_raises_at_build(self, tmp_path):
        """The chunked call rejects the bad policy before any compute."""
        vrt_file = _make_partial_vrt(str(tmp_path))[0]
        expect_rejection = pytest.raises(ValueError, match="missing_sources")
        with expect_rejection:
            read_vrt(vrt_file, chunks=4, missing_sources="ignore")

    def test_invalid_policy_raises_without_chunks_too(self, tmp_path):
        """The eager call rejects the same bad policy.

        Checking both modes pins parity: callers get the same error
        whether or not they pass ``chunks=``.
        """
        vrt_file = _make_partial_vrt(str(tmp_path))[0]
        expect_rejection = pytest.raises(ValueError, match="missing_sources")
        with expect_rejection:
            read_vrt(vrt_file, missing_sources="ignore")
0 commit comments