Skip to content

Commit 80ec851

Browse files
committed
geotiff: chunked-VRT coverage for missing_sources policy (#1799)
The existing test_vrt_missing_sources_policy_1799 covers the eager read_vrt path but not the chunked path. The chunked path (_backends/vrt.py:_read_vrt_chunked) plumbs missing_sources through two separate code paths:

- A parse-time static os.path.exists sweep at vrt.py:595-621 that populates attrs['vrt_holes'] at build time without forcing a compute (the contract documented in #1734 for partial-mosaic detection on the lazy graph).
- Per-chunk forwarding of missing_sources to _vrt_chunk_read so each task applies the same warn/raise policy as the eager path.

A regression dropping the parse-time sweep would break partial-mosaic detection on the lazy build; a regression dropping the per-chunk forward would silently degrade missing_sources='raise' to 'warn' (or vice versa) on the chunked path.

Adds 8 tests, all passing:

- missing_sources='warn' populates vrt_holes at build + emits warning on compute + present-source chunk decodes correctly
- chunks=(h, w) tuple form threads through identically
- missing_sources='raise' fails at compute on chunks touching missing sources but succeeds on windowed compute that misses them
- default missing_sources is 'raise' on the chunked path (parity with eager default flipped in #1843 / #1860)
- invalid policy raises ValueError at build, parity with eager path

Mutation against attrs['vrt_holes'] population at vrt.py:620-621 flipped the two build-time tests red.
1 parent 750dc20 commit 80ec851

1 file changed

Lines changed: 208 additions & 0 deletions

File tree

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
"""Chunked-VRT coverage for ``missing_sources`` (issue #1799).
2+
3+
``test_vrt_missing_sources_policy_1799`` covers the eager (non-chunked)
4+
``read_vrt`` path. The chunked path (``read_vrt(chunks=N)``, dispatching
5+
through ``_read_vrt_chunked``) plumbs ``missing_sources`` separately:
6+
7+
* Parse-time approximation: a static ``os.path.exists`` sweep over every
8+
source populates ``attrs['vrt_holes']`` on the returned DataArray
9+
before any decode work starts (docstring in ``_backends/vrt.py:344``).
10+
* Decode-time: each per-chunk task receives ``missing_sources`` and the
11+
internal reader applies the same warn/raise policy as the eager path.
12+
13+
A regression dropping either the parse-time sweep or the per-chunk
14+
forward would silently change the contract:
15+
16+
* ``vrt_holes`` would disappear from the lazy build, breaking callers
17+
that branch on ``"vrt_holes" in da.attrs`` to detect partial mosaics
18+
before scheduling a compute (the contract documented in #1734).
19+
* ``missing_sources='raise'`` could silently degrade to ``'warn'`` (or
20+
vice versa) on the chunked path while the eager path stays correct.
21+
22+
This module pins both invariants. Tests use a 2-source mosaic where one
23+
source is missing on disk; the present source covers one chunk window
24+
and the missing source covers another, so the warn/raise policy is
25+
exercised against a non-trivial graph.
26+
"""
27+
from __future__ import annotations
28+
29+
import os
30+
import warnings
31+
32+
import numpy as np
33+
import pytest
34+
import xarray as xr
35+
36+
from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt, to_geotiff
37+
38+
39+
def _make_partial_vrt(tmp_path) -> tuple[str, str]:
    """Build a 2-source VRT with one present + one missing source.

    Writes a 4x4 float32 GeoTIFF filled with 7.0 to
    ``<tmp_path>/src_present.tif`` and an 8x4 VRT to
    ``<tmp_path>/partial.vrt``. The VRT places the present source at
    ``DstRect`` xOff=0 (left half) and ``<tmp_path>/missing.tif``, which
    is never created, at ``DstRect`` xOff=4 (right half), so chunked
    reads against the right half hit the missing-source decode path.

    ``tmp_path`` is the directory to write into (anything
    ``os.path.join`` accepts).

    Returns ``(vrt_path, present_src_path)``.
    """
    # Local import: only this helper needs XML escaping.
    from xml.sax.saxutils import escape

    src = os.path.join(tmp_path, "src_present.tif")
    arr = np.full((4, 4), 7.0, dtype=np.float32)
    da = xr.DataArray(
        arr, dims=("y", "x"),
        attrs={"transform": (1.0, 0.0, 0.0, 0.0, -1.0, 0.0)},
    )
    to_geotiff(da, src)

    # Deliberately never written to disk.
    missing = os.path.join(tmp_path, "missing.tif")
    vrt_path = os.path.join(tmp_path, "partial.vrt")

    # Paths are escaped because a temp directory may legally contain
    # ``&`` or ``<``, which would otherwise produce malformed XML. For
    # plain paths ``escape`` is the identity, so the file is unchanged.
    vrt_xml = (
        '<VRTDataset rasterXSize="8" rasterYSize="4">\n'
        '<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>\n'
        '<VRTRasterBand dataType="Float32" band="1">\n'
        '<SimpleSource>\n'
        f'<SourceFilename relativeToVRT="0">{escape(src)}</SourceFilename>\n'
        '<SourceBand>1</SourceBand>\n'
        '<SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '<DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '</SimpleSource>\n'
        '<SimpleSource>\n'
        f'<SourceFilename relativeToVRT="0">{escape(missing)}</SourceFilename>\n'
        '<SourceBand>1</SourceBand>\n'
        '<SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
        '<DstRect xOff="4" yOff="0" xSize="4" ySize="4"/>\n'
        '</SimpleSource>\n'
        '</VRTRasterBand>\n'
        '</VRTDataset>\n'
    )
    # No XML declaration is written, so the document is UTF-8 by the
    # XML default; write it as UTF-8 rather than the locale encoding.
    with open(vrt_path, "w", encoding="utf-8") as f:
        f.write(vrt_xml)
    return vrt_path, src
78+
79+
80+
class TestChunkedMissingSourcesWarn:
    """Lenient policy on the chunked path: holes at build, warnings on compute.

    With ``read_vrt(chunks=N, missing_sources='warn')`` the returned lazy
    DataArray is expected to carry ``attrs['vrt_holes']`` before any
    compute runs, and computing it is expected to emit a
    ``GeoTIFFFallbackWarning`` that names the missing source.
    """

    def test_vrt_holes_populated_at_build(self, tmp_path):
        """``vrt_holes`` lists the one missing source without a compute."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        lazy = read_vrt(vrt_path, chunks=4, missing_sources="warn")

        assert "vrt_holes" in lazy.attrs, (
            "Chunked path must populate vrt_holes at build time so "
            "callers can detect partial mosaics without forcing a "
            "compute (issue #1734)."
        )
        recorded_holes = lazy.attrs["vrt_holes"]
        assert len(recorded_holes) == 1
        only_hole = recorded_holes[0]
        assert only_hole["source"].endswith("missing.tif")

    def test_compute_emits_per_task_warning(self, tmp_path):
        """Compute warns about the missing source and keeps the present half."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))

        with warnings.catch_warnings(record=True) as recorded:
            # "always" so a previously-seen warning is not de-duplicated.
            warnings.simplefilter("always")
            lazy = read_vrt(vrt_path, chunks=4, missing_sources="warn")
            materialized = lazy.compute()

        messages = [
            str(entry.message)
            for entry in recorded
            if isinstance(entry.message, GeoTIFFFallbackWarning)
        ]
        names_missing_source = any("missing.tif" in text for text in messages)
        assert names_missing_source, (
            "Expected GeoTIFFFallbackWarning naming the missing "
            f"source after compute, got messages: {messages!r}"
        )

        # Only the present-source half (columns 0-3) is pinned: it must
        # still hold the 7.0 fill written by the fixture, so a lenient
        # path that wiped both halves would be caught. The fill value of
        # the missing half is intentionally not asserted here.
        left_half = np.asarray(materialized)[:, :4]
        expected_left = np.full((4, 4), 7.0, dtype=np.float32)
        np.testing.assert_array_equal(left_half, expected_left)

    def test_chunks_tuple_form(self, tmp_path):
        """Tuple ``chunks=(h, w)`` threads through identically."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        lazy = read_vrt(vrt_path, chunks=(2, 4), missing_sources="warn")
        attrs = lazy.attrs

        assert "vrt_holes" in attrs
        # The 4x8 raster splits into 2 x 2 chunks of shape (2, 4). Two of
        # those chunks overlap the missing source (columns 4-7), yet the
        # source must be recorded once, not once per chunk.
        assert len(attrs["vrt_holes"]) == 1
135+
136+
137+
class TestChunkedMissingSourcesRaise:
    """``read_vrt(chunks=N, missing_sources='raise')`` fails on compute.

    The eager path raises at read time. The chunked path defers to
    compute because each chunk's decode is delayed; an upfront raise
    would force the parse-time sweep to decode every source, defeating
    the lazy graph. The contract: chunks intersecting a missing source
    raise on compute; chunks intersecting only present sources still
    succeed.
    """

    def test_compute_intersecting_missing_raises(self, tmp_path):
        """Building is lazy and silent; computing the full mosaic raises."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        # Build does not raise (the graph is lazy).
        result = read_vrt(vrt_path, chunks=4, missing_sources="raise")
        # Computing a chunk that intersects the missing source raises.
        # FileNotFoundError is a subclass of OSError, so listing OSError
        # alone already covers it.
        # NOTE(review): ValueError is accepted as well; confirm which
        # exception the chunked reader actually raises and narrow this.
        with pytest.raises((OSError, ValueError)):
            result.compute()

    def test_compute_present_only_chunk_succeeds(self, tmp_path):
        """A windowed compute against only the present source succeeds.

        ``read_vrt(window=...)`` restricts the chunked graph to the
        windowed extent; if the window misses the missing source, no
        chunk needs to decode it and compute succeeds even under
        ``missing_sources='raise'``. The contract: the raise policy is
        scoped to chunks that actually touch missing sources.
        """
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        # Window covers only the present source (cols 0-3); the missing
        # source occupies cols 4-7.
        result = read_vrt(
            vrt_path, chunks=4, window=(0, 0, 4, 4),
            missing_sources="raise",
        )
        computed = result.compute()
        np.testing.assert_array_equal(
            np.asarray(computed), np.full((4, 4), 7.0, dtype=np.float32),
        )
175+
176+
177+
class TestChunkedMissingSourcesDefault:
    """The default ``missing_sources`` on chunked reads is ``'raise'``.

    The public ``read_vrt`` default flipped to ``'raise'`` in #1843 /
    #1860. The chunked path goes through the same entry point so the
    default must agree. A regression flipping the chunked default to
    ``'warn'`` would silently produce partial mosaics for callers who
    don't pass the kwarg.
    """

    def test_chunked_default_raises_on_compute(self, tmp_path):
        """Omitting ``missing_sources`` behaves like ``'raise'`` on compute."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        result = read_vrt(vrt_path, chunks=4)
        # FileNotFoundError is a subclass of OSError, so listing OSError
        # alone already covers it.
        # NOTE(review): ValueError is accepted as well; confirm which
        # exception the chunked reader actually raises and narrow this.
        with pytest.raises((OSError, ValueError)):
            result.compute()
192+
193+
194+
class TestChunkedMissingSourcesValidation:
    """An unknown ``missing_sources`` value is refused by ``read_vrt``."""

    def test_invalid_policy_raises_at_build(self, tmp_path):
        """Chunked call: the bad policy fails at build, before any compute."""
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        bad_policy = "ignore"
        with pytest.raises(ValueError, match="missing_sources"):
            read_vrt(vrt_path, chunks=4, missing_sources=bad_policy)

    def test_invalid_policy_raises_without_chunks_too(self, tmp_path):
        """Eager call: the same bad policy is rejected the same way.

        Pinning both modes keeps the error identical whether or not the
        caller passes ``chunks=``.
        """
        vrt_path, _ = _make_partial_vrt(str(tmp_path))
        bad_policy = "ignore"
        with pytest.raises(ValueError, match="missing_sources"):
            read_vrt(vrt_path, missing_sources=bad_policy)

0 commit comments

Comments
 (0)