diff --git a/xrspatial/geotiff/_decode.py b/xrspatial/geotiff/_decode.py index 363c655c..22124c26 100644 --- a/xrspatial/geotiff/_decode.py +++ b/xrspatial/geotiff/_decode.py @@ -15,14 +15,15 @@ the top of ``_reader.py``. This module deliberately has *no* module-level import of -:mod:`xrspatial.geotiff._reader` so the two files can sit on either side -of a circular relationship: ``_reader.py`` imports the decode functions -back at module load, and the few things ``_decode`` still needs from -``_reader`` (``MAX_PIXELS_DEFAULT``, ``_check_dimensions``, +:mod:`xrspatial.geotiff._reader` or :mod:`xrspatial.geotiff._layout`. +``_reader.py`` imports the decode functions back at module load, and +the layout helpers (``MAX_PIXELS_DEFAULT``, ``_check_dimensions``, ``_check_source_dimensions``, ``_sparse_fill_value``, ``_has_sparse``) -are imported lazily inside ``_read_strips`` / ``_read_tiles`` at call -time. Those names move with ``_layout.py`` in PR-H (issue #2247), at -which point the lazy imports can collapse back into top-level ones. +that PR-H moved into :mod:`._layout` are imported lazily inside +``_read_strips`` / ``_read_tiles`` at call time so that +``_layout._sparse_fill_value``'s own lazy import of +``_int_nodata_in_range`` from this module cannot turn into a +module-load cycle. """ from __future__ import annotations @@ -57,14 +58,20 @@ _NATIVE_ORDER = '<' if _sys.byteorder == 'little' else '>' #: Sentinel used as the default value of ``max_pixels`` so the actual -#: ``MAX_PIXELS_DEFAULT`` can stay in :mod:`._reader` without making this +#: ``MAX_PIXELS_DEFAULT`` can stay in :mod:`._layout` without making this #: module import-time dependent on it. ``_resolve_max_pixels`` does the #: lazy lookup at call time. _MAX_PIXELS_UNSET = object() def _resolve_max_pixels(value): - """Return ``MAX_PIXELS_DEFAULT`` when *value* is the unset sentinel.""" + """Return ``MAX_PIXELS_DEFAULT`` when *value* is the unset sentinel. + + The lookup hits :mod:`._reader` rather than :mod:`._layout` so test + monkeypatches of ``_reader.MAX_PIXELS_DEFAULT`` keep taking effect + -- the layout extraction (issue #2247) introduced the alias on + ``_reader`` precisely so the long-standing patch contract survives. + """ if value is _MAX_PIXELS_UNSET: from ._reader import MAX_PIXELS_DEFAULT return MAX_PIXELS_DEFAULT @@ -319,13 +326,10 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, ------- np.ndarray with shape (height, width) or windowed subset. """ - # Imported lazily so this module does not have an import-time - # dependency on ``_reader``. The pixel-safety guards - # (``_check_dimensions``/``_check_source_dimensions``) and the - # sparse-layout helpers (``_sparse_fill_value``/``_has_sparse``) - # both stay in ``_reader.py`` until PR-H (issue #2247); rebinding - # them here keeps PR-G mechanical. - from ._reader import ( + # Layout / validation helpers live in ``_layout`` (issue #2247). + # Imported lazily so the decode module stays cycle-free against the + # layout module's lazy import of ``_int_nodata_in_range`` from here. + from ._layout import ( _check_dimensions, _check_source_dimensions, _has_sparse, @@ -551,8 +555,12 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, ------- np.ndarray with shape (height, width) or windowed subset. """ - from ._reader import ( - MAX_PIXELS_DEFAULT, + # ``MAX_PIXELS_DEFAULT`` is read off ``_reader`` (which re-exports + # the layout module's binding) so tests that monkeypatch + # ``_reader.MAX_PIXELS_DEFAULT`` keep taking effect on the per-tile + # path. The function-level helpers come from ``_layout`` directly. + from ._reader import MAX_PIXELS_DEFAULT + from ._layout import ( _check_dimensions, _has_sparse, _sparse_fill_value, diff --git a/xrspatial/geotiff/_layout.py b/xrspatial/geotiff/_layout.py new file mode 100644 index 00000000..a7933145 --- /dev/null +++ b/xrspatial/geotiff/_layout.py @@ -0,0 +1,205 @@ +"""TIFF/COG layout validation and byte-budget helpers. + +This module is private to :mod:`xrspatial.geotiff`. It collects the +read-side layout helpers that were previously colocated with the +top-level reader: pixel-count safety limits, source-dimension +validation, sparse-tile/strip handling, the per-image byte budget used +to bound HTTP body reads, and the IFD-extent probe used by the COG +HTTP prefetch grow loop. + +The helpers are intentionally free of decode (codec) and transport +(HTTP / mmap / fsspec) concerns. Decode lives in :mod:`._decode`, +transport lives in :mod:`._sources`, top-level orchestration lives in +:mod:`._reader`. + +Source: PR-H of the GeoTIFF refactor epic, issue #2247. +""" +from __future__ import annotations + +import math + +import numpy as np + +from ._decode import _int_nodata_in_range +from ._geotags import _parse_nodata_str as _parse_nd +from ._header import IFD +from ._sources import _max_tile_bytes_from_env + +# --------------------------------------------------------------------------- +# Allocation guard: reject TIFF dimensions that would exhaust memory +# --------------------------------------------------------------------------- + +#: Default maximum total pixel count (width * height * samples). +#: ~1 billion pixels, which is ~4 GB for float32 single-band. +#: Override per-call via the ``max_pixels`` keyword argument. +MAX_PIXELS_DEFAULT = 1_000_000_000 + + +class PixelSafetyLimitError(ValueError): + """Raised when a requested TIFF allocation exceeds max_pixels.""" + + +def _check_dimensions(width, height, samples, max_pixels): + """Raise PixelSafetyLimitError if the request exceeds *max_pixels*.""" + total = width * height * samples + if total > max_pixels: + raise PixelSafetyLimitError( + f"TIFF image dimensions ({width} x {height} x {samples} = " + f"{total:,} pixels) exceed the safety limit of " + f"{max_pixels:,} pixels. Pass a larger max_pixels value to " + f"read_to_array() if this file is legitimate." + ) + + +def _check_source_dimensions(width, height, samples): + """Validate the source IFD dimensions of a TIFF before any windowing. + + Companion to :func:`_check_dimensions`, which only enforces the + upper bound. The stripped read paths read ``width``, ``height``, + and ``samples_per_pixel`` straight off the IFD and then clamp the + output window to those values, so a malformed file with + ``ImageWidth = 0`` (or a negative value, which would parse as a + huge unsigned int but can also surface via signed-cast errors) + would produce an empty array silently. The tiled paths are already + protected by :func:`validate_tile_layout` in ``_header.py``; this + helper closes the same gap for the stripped path. Issue #2053. + """ + if width <= 0 or height <= 0 or samples <= 0: + raise ValueError( + f"Invalid TIFF dimensions: ImageWidth={width}, " + f"ImageLength={height}, SamplesPerPixel={samples} " + f"(all must be > 0)" + ) + + +# --------------------------------------------------------------------------- +# Sparse tile / strip handling +# --------------------------------------------------------------------------- + + +def _sparse_fill_value(ifd: IFD, dtype: np.dtype): + """Resolve the fill value for sparse tiles/strips. + + A sparse TIFF entry has TileByteCounts/StripByteCounts == 0 (and + typically the matching Offset == 0). GDAL emits these for SPARSE_OK + files where blocks containing only the nodata value are omitted. + The reader is expected to materialise such blocks as nodata, or + zero when nodata is unset (the default per the GDAL convention). + """ + nodata_str = ifd.nodata_str + if nodata_str is not None: + # Try ``int`` first so 64-bit sentinels survive without the + # float64 round-trip; fall back to ``float`` for NaN / Inf / + # scientific notation / fractional values. See issue #1847. + parsed = _parse_nd(nodata_str) + if parsed is not None: + if dtype.kind == 'f': + return dtype.type(parsed) + if isinstance(parsed, int): + if _int_nodata_in_range(parsed, dtype): + return dtype.type(parsed) + elif not math.isnan(parsed) and not math.isinf(parsed): + if float(parsed).is_integer(): + nodata_int = int(parsed) + if _int_nodata_in_range(nodata_int, dtype): + return dtype.type(nodata_int) + return dtype.type(0) + + +def _has_sparse(byte_counts) -> bool: + """Return True if any tile/strip is empty (byte_count == 0).""" + if byte_counts is None: + return False + for bc in byte_counts: + if bc == 0: + return True + return False + + +# --------------------------------------------------------------------------- +# Per-image byte budget (HTTP body bound for stripped TIFFs) +# --------------------------------------------------------------------------- + +#: Slack added to the strip-table byte budget for the TIFF header, +#: trailing IFD chain, ExifIFD, GeoKey directory, GDAL_METADATA, and any +#: ICC profile or XMP packet. 4 MiB is comfortable for real-world COGs +#: (the prefetch path already tolerates up to ``MAX_HTTP_HEADER_BYTES`` +#: of header bytes) while still bounding the body away from gigabyte +#: scale. Issue #2051. +_FULL_IMAGE_BUDGET_HEADER_SLACK = 4 * 1024 * 1024 + + +def _compute_full_image_byte_budget(offsets, byte_counts) -> int: + """Compute an upper bound on the legitimate HTTP body size for a stripped TIFF. + + A stripped TIFF body is laid out as: [TIFF header + IFDs + tag value + arrays] followed by strip payloads at the offsets listed in + ``StripOffsets``. The largest byte index any strip references is + ``max(offset + byte_count)`` across the strip table; the body cannot + legitimately extend past that point plus a small tail for trailing + metadata. We add :data:`_FULL_IMAGE_BUDGET_HEADER_SLACK` to cover the + header prologue (which lives at offset 0) and any tags that follow + the last strip. The cap is loose by design -- it exists to reject + bodies that are orders of magnitude larger than the file claims to + be, not to second-guess legitimate layouts. + + If the strip table is missing or empty (sparse-only, malformed), + fall back to the per-strip safety cap so the read is still bounded. + Issue #2051. + """ + fallback = _max_tile_bytes_from_env() + _FULL_IMAGE_BUDGET_HEADER_SLACK + if not offsets or not byte_counts: + return fallback + max_end = 0 + for off, bc in zip(offsets, byte_counts): + try: + end = int(off) + int(bc) + except (TypeError, ValueError): + continue + if end > max_end: + max_end = end + if max_end <= 0: + return fallback + return max_end + _FULL_IMAGE_BUDGET_HEADER_SLACK + + +# --------------------------------------------------------------------------- +# COG HTTP IFD-extent probe (grow loop) +# --------------------------------------------------------------------------- + + +def _ifd_required_extent(ifds: list[IFD]) -> int: + """Return the highest byte offset the parsed IFDs reference. + + Used to decide whether the prefetch buffer is large enough to hold the + entire IFD chain plus every out-of-line tag value. We compare this + against ``len(data)`` in :func:`_parse_cog_http_meta`; if it exceeds the + buffer, the chain is truncated and the caller must grow and retry. + + The walk re-derives each tag's value-area placement directly from the + IFD layout (entry table base + entry slot) rather than re-parsing the + raw bytes. For out-of-line tags ``parse_ifd`` already resolved the + pointer and validated ``ptr + size <= len(data)``; the *interesting* + extent for the grow loop is the next-IFD pointer of the chain tail, + plus an "is there a next IFD we have not yet seen" probe. + """ + if not ifds: + return 0 + + required = 0 + # Last IFD's next_ifd_offset: 0 means end-of-chain; anything else + # points at an IFD we haven't parsed yet because it sat past the + # buffer (parse_all_ifds stops on offset >= len(data)). + tail_next = ifds[-1].next_ifd_offset + if tail_next != 0: + # Need at least enough bytes to reach the next IFD header. Pad + # by a small amount so parse_ifd can read the num_entries field + # without truncation -- the actual entry table is bounded by the + # parser's own checks on the next grow iteration. + required = max(required, tail_next + 64) + + # Out-of-line tag values are already parsed (parse_ifd bounds-checked + # ptr + total_size <= len(data) before reading). For grow logic we + # only need to ensure those checks did not *fail*; a thrown + # ValueError surfaces in parse_all_ifds and is handled by the loop. + return required diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 05fb4c8e..a13a592f 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -46,52 +46,26 @@ ) from ._validation import _validate_predictor_sample_format -# --------------------------------------------------------------------------- -# Allocation guard: reject TIFF dimensions that would exhaust memory -# --------------------------------------------------------------------------- - -#: Default maximum total pixel count (width * height * samples). -#: ~1 billion pixels, which is ~4 GB for float32 single-band. -#: Override per-call via the ``max_pixels`` keyword argument. -MAX_PIXELS_DEFAULT = 1_000_000_000 - - -class PixelSafetyLimitError(ValueError): - """Raised when a requested TIFF allocation exceeds max_pixels.""" - - -def _check_dimensions(width, height, samples, max_pixels): - """Raise PixelSafetyLimitError if the request exceeds *max_pixels*.""" - total = width * height * samples - if total > max_pixels: - raise PixelSafetyLimitError( - f"TIFF image dimensions ({width} x {height} x {samples} = " - f"{total:,} pixels) exceed the safety limit of " - f"{max_pixels:,} pixels. Pass a larger max_pixels value to " - f"read_to_array() if this file is legitimate." - ) - - -def _check_source_dimensions(width, height, samples): - """Validate the source IFD dimensions of a TIFF before any windowing. - - Companion to :func:`_check_dimensions`, which only enforces the - upper bound. The stripped read paths read ``width``, ``height``, - and ``samples_per_pixel`` straight off the IFD and then clamp the - output window to those values, so a malformed file with - ``ImageWidth = 0`` (or a negative value, which would parse as a - huge unsigned int but can also surface via signed-cast errors) - would produce an empty array silently. The tiled paths are already - protected by :func:`validate_tile_layout` in ``_header.py``; this - helper closes the same gap for the stripped path. Issue #2053. - """ - if width <= 0 or height <= 0 or samples <= 0: - raise ValueError( - f"Invalid TIFF dimensions: ImageWidth={width}, " - f"ImageLength={height}, SamplesPerPixel={samples} " - f"(all must be > 0)" - ) - +# Layout / validation helpers live in ``_layout``. They are imported back +# here so that: +# * existing call sites inside this module keep their bare names, and +# * the historical public import surface +# (``from xrspatial.geotiff._reader import PixelSafetyLimitError``, +# ``MAX_PIXELS_DEFAULT``, ``_check_dimensions`` and friends -- used +# by sidecar / VRT / GPU / dask backends and by the test suite) is +# preserved without churn. +# Source: PR-H of the GeoTIFF refactor epic, issue #2247. +from ._layout import ( # noqa: F401 + MAX_PIXELS_DEFAULT, + PixelSafetyLimitError, + _FULL_IMAGE_BUDGET_HEADER_SLACK, + _check_dimensions, + _check_source_dimensions, + _compute_full_image_byte_budget, + _has_sparse, + _ifd_required_extent, + _sparse_fill_value, +) # The data-source layer (local mmap, HTTP with SSRF defences and DNS-rebind # pinning, fsspec cloud, BytesIO) lives in ``_sources``. It is imported back @@ -176,89 +150,6 @@ def _check_source_dimensions(width, height, samples): ) -def _sparse_fill_value(ifd: IFD, dtype: np.dtype): - """Resolve the fill value for sparse tiles/strips. - - A sparse TIFF entry has TileByteCounts/StripByteCounts == 0 (and - typically the matching Offset == 0). GDAL emits these for SPARSE_OK - files where blocks containing only the nodata value are omitted. - The reader is expected to materialise such blocks as nodata, or - zero when nodata is unset (the default per the GDAL convention). - """ - nodata_str = ifd.nodata_str - if nodata_str is not None: - # Try ``int`` first so 64-bit sentinels survive without the - # float64 round-trip; fall back to ``float`` for NaN / Inf / - # scientific notation / fractional values. See issue #1847. - from ._geotags import _parse_nodata_str as _parse_nd - parsed = _parse_nd(nodata_str) - if parsed is not None: - if dtype.kind == 'f': - return dtype.type(parsed) - if isinstance(parsed, int): - if _int_nodata_in_range(parsed, dtype): - return dtype.type(parsed) - elif not math.isnan(parsed) and not math.isinf(parsed): - if float(parsed).is_integer(): - nodata_int = int(parsed) - if _int_nodata_in_range(nodata_int, dtype): - return dtype.type(nodata_int) - return dtype.type(0) - - -def _has_sparse(byte_counts) -> bool: - """Return True if any tile/strip is empty (byte_count == 0).""" - if byte_counts is None: - return False - for bc in byte_counts: - if bc == 0: - return True - return False - - -#: Slack added to the strip-table byte budget for the TIFF header, -#: trailing IFD chain, ExifIFD, GeoKey directory, GDAL_METADATA, and any -#: ICC profile or XMP packet. 4 MiB is comfortable for real-world COGs -#: (the prefetch path already tolerates up to ``MAX_HTTP_HEADER_BYTES`` -#: of header bytes) while still bounding the body away from gigabyte -#: scale. Issue #2051. -_FULL_IMAGE_BUDGET_HEADER_SLACK = 4 * 1024 * 1024 - - -def _compute_full_image_byte_budget(offsets, byte_counts) -> int: - """Compute an upper bound on the legitimate HTTP body size for a stripped TIFF. - - A stripped TIFF body is laid out as: [TIFF header + IFDs + tag value - arrays] followed by strip payloads at the offsets listed in - ``StripOffsets``. The largest byte index any strip references is - ``max(offset + byte_count)`` across the strip table; the body cannot - legitimately extend past that point plus a small tail for trailing - metadata. We add :data:`_FULL_IMAGE_BUDGET_HEADER_SLACK` to cover the - header prologue (which lives at offset 0) and any tags that follow - the last strip. The cap is loose by design -- it exists to reject - bodies that are orders of magnitude larger than the file claims to - be, not to second-guess legitimate layouts. - - If the strip table is missing or empty (sparse-only, malformed), - fall back to the per-strip safety cap so the read is still bounded. - Issue #2051. - """ - fallback = _max_tile_bytes_from_env() + _FULL_IMAGE_BUDGET_HEADER_SLACK - if not offsets or not byte_counts: - return fallback - max_end = 0 - for off, bc in zip(offsets, byte_counts): - try: - end = int(off) + int(bc) - except (TypeError, ValueError): - continue - if end > max_end: - max_end = end - if max_end <= 0: - return fallback - return max_end + _FULL_IMAGE_BUDGET_HEADER_SLACK - - # --------------------------------------------------------------------------- # COG HTTP reader # --------------------------------------------------------------------------- @@ -275,45 +166,6 @@ def _compute_full_image_byte_budget(offsets, byte_counts) -> int: MAX_HTTP_HEADER_BYTES = 4 * 1024 * 1024 -def _ifd_required_extent( - ifds: list[IFD], header: TIFFHeader, data_len: int, -) -> int: - """Return the highest byte offset the parsed IFDs reference. - - Used to decide whether the prefetch buffer is large enough to hold the - entire IFD chain plus every out-of-line tag value. We compare this - against ``len(data)`` in :func:`_parse_cog_http_meta`; if it exceeds the - buffer, the chain is truncated and the caller must grow and retry. - - The walk re-derives each tag's value-area placement directly from the - IFD layout (entry table base + entry slot) rather than re-parsing the - raw bytes. For out-of-line tags ``parse_ifd`` already resolved the - pointer and validated ``ptr + size <= data_len``; the *interesting* - extent for the grow loop is the next-IFD pointer of the chain tail, - plus an "is there a next IFD we have not yet seen" probe. - """ - if not ifds: - return 0 - - required = 0 - # Last IFD's next_ifd_offset: 0 means end-of-chain; anything else - # points at an IFD we haven't parsed yet because it sat past the - # buffer (parse_all_ifds stops on offset >= len(data)). - tail_next = ifds[-1].next_ifd_offset - if tail_next != 0: - # Need at least enough bytes to reach the next IFD header. Pad - # by a small amount so parse_ifd can read the num_entries field - # without truncation -- the actual entry table is bounded by the - # parser's own checks on the next grow iteration. - required = max(required, tail_next + 64) - - # Out-of-line tag values are already parsed (parse_ifd bounds-checked - # ptr + total_size <= len(data) before reading). For grow logic we - # only need to ensure those checks did not *fail*; a thrown - # ValueError surfaces in parse_all_ifds and is handled by the loop. - return required - - def _parse_cog_http_meta( source: _HTTPSource, overview_level: int | None = None, @@ -386,7 +238,7 @@ def _parse_cog_http_meta( while True: try: ifds = parse_all_ifds(header_bytes, header) - required = _ifd_required_extent(ifds, header, len(header_bytes)) + required = _ifd_required_extent(ifds) # Chain is fully resolved when every IFD parsed cleanly and # the tail next_ifd_offset is reachable within the buffer # (required == 0 means end-of-chain).