Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions docs/source/reference/geotiff.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,18 @@ VRT missing sources
===================

``read_vrt`` accepts ``missing_sources='warn'`` or ``'raise'``. The default
``'warn'`` preserves the historical behavior: unreadable source files emit
:class:`xrspatial.geotiff.GeoTIFFFallbackWarning`, the returned DataArray
contains ``attrs['vrt_holes']``, and the mosaic is returned with holes.
Use ``missing_sources='raise'`` when a partial mosaic should fail the
pipeline immediately. ``XRSPATIAL_GEOTIFF_STRICT=1`` still raises in
``'warn'`` mode so CI environments can enforce fail-fast behavior globally.
``'raise'`` (since #1860) fails the read immediately if any source file
referenced by the VRT and needed by the requested read does not exist on
disk. Both the eager and chunked
dispatchers honour this at construction time -- chunked callers do not
have to wait until ``compute()`` to learn the VRT is broken (#2265).
The static missing-source sweep is scoped to the requested ``window=``
and ``band=`` so a windowed or band-restricted read that does not depend
on a missing source still succeeds.

Pass ``missing_sources='warn'`` to opt into the lenient path: unreadable
source files emit :class:`xrspatial.geotiff.GeoTIFFFallbackWarning`, the
returned DataArray carries ``attrs['vrt_holes']``, and the mosaic is
returned with holes left as the band's nodata sentinel (or zero on
integer bands without a sentinel). ``XRSPATIAL_GEOTIFF_STRICT=1``
forces the raise in ``'warn'`` mode too, so CI environments can enforce
fail-fast behavior globally.
86 changes: 81 additions & 5 deletions xrspatial/geotiff/_backends/vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,13 @@ def read_vrt(source: str, *,
unreadable backing source so a partial mosaic never surfaces
silently. This matches the internal ``_vrt.read_vrt`` default
and the rest of the geotiff module's up-front rejection of
malformed input. Prior to #1860 the public default was
``'warn'``; callers that relied on the lenient behaviour pass
malformed input. Both the eager and chunked dispatchers raise
at construction time when the static missing-source sweep
finds any source file that does not exist on disk and
intersects the requested window (#2265); chunked callers no
longer have to wait until ``compute()`` to learn the VRT is
broken. Prior to #1860 the public default was ``'warn'``;
callers that relied on the lenient behaviour pass
``missing_sources='warn'`` explicitly.
``'warn'`` is the opt-in escape hatch for partial mosaics: it
emits ``GeoTIFFFallbackWarning``, records ``attrs['vrt_holes']``,
Expand Down Expand Up @@ -695,6 +700,7 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
import dask.array as da

from .._reader import MAX_PIXELS_DEFAULT
from .._runtime import _geotiff_strict_mode
from .._vrt import (
parse_vrt,
_read_vrt_xml,
Expand Down Expand Up @@ -972,17 +978,87 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
# actually present. Each entry mirrors the eager schema:
# ``{'source', 'band', 'dst_rect', 'error'}``.
chunked_holes: list[dict] = []
for vrt_band in vrt.bands:
for band_idx, vrt_band in enumerate(vrt.bands):
# When ``band`` is restricted, the per-chunk decode never touches
# bands outside the selection, so a missing source on an
# unrelated band does not affect the mosaic and should not
# populate ``vrt_holes`` (mirrors the eager path, which only
# decodes the selected band's sources). ``band`` is a 0-based
# index into ``vrt.bands``, same convention as the
# ``selected_bands = [vrt.bands[band]]`` slice above. We compare
# against ``band_idx`` rather than ``vrt_band.band_num``
# (the XML's 1-based ``band=`` attribute) because the XML
# attribute does not have to match list position on hand-rolled
# VRTs.
if band is not None and band_idx != band:
continue
for src in vrt_band.sources:
if not _os.path.exists(src.filename):
# Skip holes that fall entirely outside the requested
# window. Each chunk task only decodes sources that
# intersect its destination rect, so a missing source
# outside the window never gets touched and the eager
# path with the same window would also not raise.
# ``win_r0/win_c0`` are the row/col origin of the
# requested window in the VRT's destination coordinate
# space and ``full_h/full_w`` are its size.
dst = src.dst_rect
if not (
dst.x_off + dst.x_size > win_c0
and dst.x_off < win_c0 + full_w
and dst.y_off + dst.y_size > win_r0
and dst.y_off < win_r0 + full_h
):
continue
chunked_holes.append({
'source': src.filename,
'band': vrt_band.band_num,
'dst_rect': (src.dst_rect.x_off, src.dst_rect.y_off,
src.dst_rect.x_size, src.dst_rect.y_size),
'dst_rect': (dst.x_off, dst.y_off,
dst.x_size, dst.y_size),
'error': 'FileNotFoundError: source file not found',
})

# Fail-fast for ``missing_sources='raise'`` (the public default since
# #1860). The docstring at the top of ``read_vrt`` promises that
# ``'raise'`` "fails immediately on an unreadable backing source so a
# partial mosaic never surfaces silently". Without this guard the
# chunked path constructs a delayed graph whose tasks each raise
# individually at compute time; if the caller never computes a chunk
# that touches a missing source (e.g. windowed downstream slicing
# past the hole), the raise never fires and the partial mosaic ships
# silently. The static ``os.path.exists`` sweep above already has the
# information needed to raise up front -- no extra source decoding
# required. ``XRSPATIAL_GEOTIFF_STRICT=1`` also forces the raise
# regardless of the kwarg, matching the eager path's strict-mode
# contract. See issue #2265.
if chunked_holes and (
missing_sources == 'raise' or _geotiff_strict_mode()
):
# Surface the first few missing paths in the message so the
# caller can act on them without having to flip to ``'warn'``
# and re-parse the resulting ``attrs['vrt_holes']``. Cap the
# preview at 3 entries to keep the error string bounded on
# mosaics with many missing tiles -- the total count is
# appended so the caller still knows the full magnitude.
preview_max = 3
preview = chunked_holes[:preview_max]
preview_str = ', '.join(
f"{h['source']!r} (band {h['band']})" for h in preview
)
more = len(chunked_holes) - len(preview)
if more > 0:
preview_str += f" and {more} more"
raise FileNotFoundError(
f"VRT references missing source file(s) that intersect "
f"the requested window: {preview_str}. The chunked VRT "
f"read aborts up front under missing_sources='raise' "
f"(the default) so a partial mosaic never surfaces "
f"silently. Pass missing_sources='warn' to opt into the "
f"lenient path that records holes in attrs['vrt_holes'] "
f"and warns at compute time. "
f"{len(chunked_holes)} missing source(s) total."
)

# Wave 3 of #2162: route attrs assembly through
# ``_finalize_lazy_read_attrs`` so the VRT chunked path shares the
# validate-then-populate-then-stamp block with the eager VRT path
Expand Down
17 changes: 16 additions & 1 deletion xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,27 @@ def test_read_vrt_chunks_matches_eager_values(tmp_path):


def test_read_vrt_chunks_does_not_read_sources_during_construction(tmp_path):
"""The chunked path must not eagerly decode sources at build.

Construction does run a cheap ``os.path.exists`` sweep over each
source (to populate ``vrt_holes`` and to fail-fast under the
default ``missing_sources='raise'``), but it must not open or
decode any source file. This test pairs the missing source with
the lenient ``missing_sources='warn'`` opt-in so the build
succeeds; the assertion is that no decode-time warnings (which
would only fire if the source were actually read) leak out
during construction.
"""
vrt = tmp_path / "tmp_1798_missing_source.vrt"
_write_vrt(vrt, "missing.tif")

with warnings.catch_warnings(record=True) as caught:
lazy = read_vrt(str(vrt), chunks=2)
lazy = read_vrt(str(vrt), chunks=2, missing_sources="warn")

# Build-time warnings from the decode codecs should be absent.
# ``missing_sources='warn'`` does not warn at build time either; the
# per-task ``GeoTIFFFallbackWarning`` only fires when a chunk
# actually decodes the missing tile during ``compute()``.
assert caught == []
assert hasattr(lazy.data, 'compute')

Expand Down
Loading
Loading