|
| 1 | +"""Release gate: stable-codec read/write/read round-trip (epic #2341). |
| 2 | +
|
| 3 | +PR 4 of 5 of epic #2341. The release contract names a specific set of |
| 4 | +codecs as ``stable`` in :data:`xrspatial.geotiff.SUPPORTED_FEATURES`: |
| 5 | +``none``, ``deflate``, ``lzw``, ``zstd``, ``packbits``. The release |
| 6 | +notes promise that on any of these codecs, a round-trip preserves both |
| 7 | +bit-exact pixels AND every canonical release attr key, on every dtype |
| 8 | +the library promises to round-trip. |
| 9 | +
|
| 10 | +Existing tests split the contract: |
| 11 | +
|
| 12 | +* ``test_compression.py`` covers codec internals (LZW dictionary edge |
| 13 | + cases, PackBits boundary cases, deflate stream framing). |
| 14 | +* ``test_supported_features_tiers_2137.py`` pins the |
| 15 | + ``SUPPORTED_FEATURES`` tier table. |
| 16 | +* ``test_release_gate_codecs.py`` pins lossless pixel round-trip for |
| 17 | + two dtypes (``uint16``, ``float32``). |
| 18 | +
|
| 19 | +This file is the joint gate: the cartesian product of every stable |
| 20 | +codec with every promised dtype, asserting both pixel equality AND |
| 21 | +release-attr equality through a full read/write/read cycle. |
| 22 | +
|
| 23 | +Out of scope: |
| 24 | +
|
| 25 | +* Experimental codecs (``lerc``, ``jpeg2000``, ``j2k``, ``lz4``) -- |
| 26 | + release tier is ``experimental``; covered by |
| 27 | + ``test_supported_features_tiers_2137.py``. |
| 28 | +* Internal-only ``jpeg`` -- not part of the public surface. |
| 29 | +* COG layout (``test_release_gate_cog.py``). |
| 30 | +* Backend parity (``test_backend_parity_matrix.py``). |
| 31 | +""" |
| 32 | +from __future__ import annotations |
| 33 | + |
| 34 | +import uuid |
| 35 | + |
| 36 | +import numpy as np |
| 37 | +import pytest |
| 38 | +import xarray as xr |
| 39 | + |
| 40 | +from xrspatial.geotiff import SUPPORTED_FEATURES, open_geotiff, to_geotiff |
| 41 | +from xrspatial.geotiff._compression import (COMPRESSION_DEFLATE, COMPRESSION_LZW, |
| 42 | + COMPRESSION_NONE, COMPRESSION_PACKBITS, |
| 43 | + COMPRESSION_ZSTD) |
| 44 | +from xrspatial.geotiff._header import parse_header, parse_ifd |
| 45 | + |
# Lossless codecs the release contract tiers as ``stable``. This tuple
# has to mirror the ``codec.*`` entries marked ``stable`` in
# :data:`xrspatial.geotiff.SUPPORTED_FEATURES`; the drift-guard test at
# the end of this module fails when the two sets diverge.
STABLE_CODECS = (
    "none",
    "deflate",
    "lzw",
    "zstd",
    "packbits",
)
| 51 | + |
# Dtypes the release contract promises to round-trip through every
# stable codec. The two signed integer widths cover the integer path;
# the two IEEE float widths cover the float path, where NaN acts as the
# nodata sentinel.
DTYPES = (
    "int16",
    "int32",
    "float32",
    "float64",
)
| 57 | + |
# Expected on-disk TIFF Compression tag value for each stable codec
# name. The test reads ``ifd.compression`` from the parsed IFD and
# checks it against this table, because there is no high-level
# ``attrs['compression']`` key to assert on (see issue #2341).
_CODEC_TO_TIFF_TAG = dict(
    none=COMPRESSION_NONE,
    deflate=COMPRESSION_DEFLATE,
    lzw=COMPRESSION_LZW,
    zstd=COMPRESSION_ZSTD,
    packbits=COMPRESSION_PACKBITS,
)
| 69 | + |
| 70 | +# Per-dtype integer nodata sentinel. Float dtypes use NaN. The |
| 71 | +# integer sentinels are well outside the natural value range of the |
| 72 | +# fixture below (small ascending integers) so the sentinel never |
| 73 | +# collides with a real pixel. |
| 74 | +_INT_NODATA = { |
| 75 | + "int16": np.int16(-32768), |
| 76 | + "int32": np.int32(-2147483648), |
| 77 | +} |
| 78 | + |
| 79 | +# Release-attr keys the cartesian-product gate asserts on. These come |
| 80 | +# from the issue body (#2341) and from the canonical attrs the reader |
| 81 | +# emits (see ``test_release_gate_attrs_contract.py``). ``raster_type`` |
| 82 | +# is included even though it is only emitted when the source was |
| 83 | +# ``RasterPixelIsPoint``; we use a small fixture that defaults to |
| 84 | +# ``'area'`` so it is normalized below in ``_canonical_attrs``. |
| 85 | +_RELEASE_ATTR_KEYS = ( |
| 86 | + "transform", |
| 87 | + "crs", |
| 88 | + "crs_wkt", |
| 89 | + "nodata", |
| 90 | + "masked_nodata", |
| 91 | + "georef_status", |
| 92 | + "raster_type", |
| 93 | +) |
| 94 | + |
| 95 | + |
def _make_input(dtype_name: str) -> xr.DataArray:
    """Return the 128x128 georeferenced fixture for ``dtype_name``.

    Pixel (0, 0) always holds the nodata value: NaN for floating
    dtypes, the ``_INT_NODATA`` entry for integer dtypes. All other
    pixels come from a deterministic ramp (``linspace`` over
    [-100, 100] for floats, ``arange`` for integers), so a flipped axis
    or a stride bug shows up as a pixel mismatch.

    The result has dims ``("y", "x")``, a descending ``y`` coordinate,
    an ascending ``x`` coordinate, and ``attrs`` holding ``crs`` (32610)
    and ``nodata``.
    """
    rows, cols = 128, 128
    count = rows * cols
    np_dtype = np.dtype(dtype_name)

    fill: float | int
    if not np.issubdtype(np_dtype, np.floating):
        # Non-negative ramp topping out at ``count - 1 == 16383``, which
        # fits in ``int16`` (max 32767) and stays clear of the dtype-min
        # sentinel. A narrower dtype such as ``int8`` would wrap and
        # could hit the sentinel; cap the ramp or shrink the fixture
        # before adding one.
        pixels = np.arange(count, dtype=np_dtype).reshape(rows, cols)
        fill = _INT_NODATA[dtype_name]
        pixels[0, 0] = fill
    else:
        pixels = np.linspace(
            -100.0, 100.0, count, dtype=np_dtype
        ).reshape(rows, cols)
        pixels[0, 0] = np.nan
        fill = float("nan")

    # Pixel-centre coordinates for 30 m cells, with the first row at the
    # largest y. The writer is expected to derive a GeoTransform of
    # ``(30, 0, origin_x, 0, -30, origin_y)`` from these.
    row_centres = np.arange(rows) + 0.5
    col_centres = np.arange(cols) + 0.5
    y_coords = 4000000.0 - 30.0 * row_centres
    x_coords = 500000.0 + 30.0 * col_centres

    return xr.DataArray(
        pixels,
        dims=("y", "x"),
        coords={"y": y_coords, "x": x_coords},
        attrs={"crs": 32610, "nodata": fill},
    )
| 135 | + |
| 136 | + |
def _canonical_attrs(da: xr.DataArray) -> dict:
    """Return ``da.attrs`` restricted to ``_RELEASE_ATTR_KEYS``.

    Keys absent from ``da.attrs`` map to ``None``, except
    ``raster_type``, which falls back to ``'area'``. The fallback lets
    a read that omits the key (the default area raster) compare equal
    to one that spells ``'area'`` out.
    """
    attrs = da.attrs
    fallbacks = {"raster_type": "area"}
    return {
        name: attrs.get(name, fallbacks.get(name))
        for name in _RELEASE_ATTR_KEYS
    }
| 152 | + |
| 153 | + |
def _read_tiff_compression_tag(path: str) -> int:
    """Return the Compression tag stored in the first IFD of ``path``.

    The whole file is read into memory and handed to ``parse_header``
    and ``parse_ifd``. Going through the IFD pins the codec that was
    actually written to disk; the DataArray attrs are not used because
    the high-level reader does not expose ``attrs['compression']``
    (open question on issue #2341).
    """
    with open(path, "rb") as stream:
        raw = stream.read()
    tiff_header = parse_header(raw)
    first_ifd = parse_ifd(raw, tiff_header.first_ifd_offset, tiff_header)
    return first_ifd.compression
| 167 | + |
| 168 | + |
| 169 | +def _assert_pixels_equal(actual: np.ndarray, expected: np.ndarray, |
| 170 | + *, codec: str, dtype_name: str) -> None: |
| 171 | + """NaN-aware byte-exact pixel comparison. |
| 172 | +
|
| 173 | + The float path uses ``equal_nan=True`` so the NaN sentinel |
| 174 | + matches NaN-to-NaN. The integer path uses strict |
| 175 | + ``array_equal`` -- the sentinel is just another integer value |
| 176 | + and must round-trip bit-exact. |
| 177 | + """ |
| 178 | + assert actual.shape == expected.shape, ( |
| 179 | + f"release gate (#2341): codec {codec!r} dtype {dtype_name!r} " |
| 180 | + f"reshaped the array across the round-trip: " |
| 181 | + f"{expected.shape} -> {actual.shape}" |
| 182 | + ) |
| 183 | + assert actual.dtype == expected.dtype, ( |
| 184 | + f"release gate (#2341): codec {codec!r} promoted dtype " |
| 185 | + f"{dtype_name!r} to {actual.dtype!r} across the round-trip" |
| 186 | + ) |
| 187 | + if np.issubdtype(expected.dtype, np.floating): |
| 188 | + equal = np.array_equal(actual, expected, equal_nan=True) |
| 189 | + else: |
| 190 | + equal = np.array_equal(actual, expected) |
| 191 | + if not equal: |
| 192 | + # Surface the first divergent pixel so a debug session can |
| 193 | + # jump straight to the offending tile / row. |
| 194 | + if np.issubdtype(expected.dtype, np.floating): |
| 195 | + mismatch_mask = ~( |
| 196 | + (actual == expected) | (np.isnan(actual) & np.isnan(expected)) |
| 197 | + ) |
| 198 | + else: |
| 199 | + mismatch_mask = actual != expected |
| 200 | + first = np.argwhere(mismatch_mask) |
| 201 | + first_idx = tuple(int(v) for v in first[0]) if first.size else None |
| 202 | + first_actual = ( |
| 203 | + actual[first_idx] if first_idx is not None else None |
| 204 | + ) |
| 205 | + first_expected = ( |
| 206 | + expected[first_idx] if first_idx is not None else None |
| 207 | + ) |
| 208 | + raise AssertionError( |
| 209 | + f"release gate (#2341): codec {codec!r} did not preserve " |
| 210 | + f"{dtype_name!r} pixels byte-for-byte; the release contract " |
| 211 | + f"names this codec as lossless for this dtype. First " |
| 212 | + f"divergence at index {first_idx!r}: actual=" |
| 213 | + f"{first_actual!r}, expected={first_expected!r}" |
| 214 | + ) |
| 215 | + |
| 216 | + |
@pytest.mark.release_gate
@pytest.mark.parametrize("dtype_name", DTYPES)
@pytest.mark.parametrize("codec", STABLE_CODECS)
def test_release_gate_codec_round_trip(tmp_path, codec, dtype_name) -> None:
    """Stable codec * dtype: pixels and release attrs survive a full
    read/write/read cycle.

    Steps:

    1. Build an in-memory DataArray with a known transform, CRS, and
       nodata sentinel (NaN for float; per-dtype int min for int).
    2. Write via ``to_geotiff(source, path, compression=codec)``.
    3. Read back via ``open_geotiff(path)`` -- this is the canonical
       baseline. The reader fills in ``crs_wkt``,
       ``georef_status``, ``masked_nodata``, etc. The baseline pixels
       must already equal the in-memory source.
    4. Write the baseline DataArray to a second path under the same
       codec.
    5. Read the second path back; assert byte-exact pixels and every
       release-attr key matches the baseline.

    The two-pass shape is what makes this a *round-trip* gate
    rather than a single-pass write-and-read gate: the canonical
    attrs themselves have to survive the second cycle, not just the
    first. The source-vs-baseline pixel check in step 3 closes the
    remaining gap: without it, a corruption that is stable under a
    repeated write/read (e.g. zeroed or clamped pixels) would make the
    second read agree with the first and pass unnoticed.
    """
    # Unique tag per parametrized case so parallel pytest workers and
    # parallel rockout worktrees never collide on the same tmp file.
    nonce = uuid.uuid4().hex[:8]
    write_first = str(
        tmp_path
        / f"release_gate_2341_{codec}_{dtype_name}_first_{nonce}.tif"
    )
    write_second = str(
        tmp_path
        / f"release_gate_2341_{codec}_{dtype_name}_second_{nonce}.tif"
    )

    source = _make_input(dtype_name)
    is_float = np.issubdtype(np.dtype(dtype_name), np.floating)

    # The masking behaviour differs by dtype: integer reads default to
    # masking the sentinel into NaN (which would change dtype and break
    # the byte-exact comparison), so we read integers with
    # ``mask_nodata=False`` to keep the sentinel as a real pixel.
    # Float reads round-trip NaN as NaN regardless of mask_nodata.
    mask_kwargs: dict = {} if is_float else {"mask_nodata": False}

    # Pass 1: write the in-memory source. The writer infers NaN as the
    # implicit float sentinel without a ``nodata=`` kwarg, so only the
    # integer branch passes one explicitly. This keeps the test from
    # locking the writer into accepting ``nodata=NaN`` if that ever
    # becomes a no-op or a rejected redundancy.
    pass_one_kwargs: dict = (
        {} if is_float else {"nodata": source.attrs["nodata"]}
    )
    to_geotiff(
        source,
        write_first,
        compression=codec,
        tiled=False,
        **pass_one_kwargs,
    )

    baseline = open_geotiff(write_first, **mask_kwargs)
    baseline_pixels = np.asarray(baseline.values)
    baseline_attrs = _canonical_attrs(baseline)

    # The on-disk TIFF Compression tag must reflect the requested codec.
    tag_first = _read_tiff_compression_tag(write_first)
    assert tag_first == _CODEC_TO_TIFF_TAG[codec], (
        f"release gate (#2341): codec {codec!r} encoded as TIFF tag "
        f"{tag_first} on first write; expected "
        f"{_CODEC_TO_TIFF_TAG[codec]} per the codec -> tag map"
    )

    # The first write/read must itself be lossless. Comparing only the
    # second read against the first would accept any corruption that
    # reproduces identically on the second pass.
    _assert_pixels_equal(
        baseline_pixels,
        np.asarray(source.values),
        codec=codec,
        dtype_name=dtype_name,
    )

    # Pass 2: rewrite the baseline DataArray under the same codec.
    # The baseline DataArray already carries ``attrs['nodata']`` from
    # the first read; the writer picks the sentinel up from the attrs
    # on the float path. For the integer branch we pass the sentinel
    # explicitly so the writer does not need to fall back to a default.
    pass_two_kwargs: dict = (
        {} if is_float else {"nodata": baseline.attrs.get("nodata")}
    )
    to_geotiff(
        baseline,
        write_second,
        compression=codec,
        tiled=False,
        **pass_two_kwargs,
    )

    second = open_geotiff(write_second, **mask_kwargs)
    second_pixels = np.asarray(second.values)
    second_attrs = _canonical_attrs(second)

    tag_second = _read_tiff_compression_tag(write_second)
    assert tag_second == _CODEC_TO_TIFF_TAG[codec], (
        f"release gate (#2341): codec {codec!r} encoded as TIFF tag "
        f"{tag_second} on the second write; expected "
        f"{_CODEC_TO_TIFF_TAG[codec]} per the codec -> tag map"
    )

    _assert_pixels_equal(
        second_pixels, baseline_pixels, codec=codec, dtype_name=dtype_name,
    )

    # Per-attribute comparison so a single failing key reports which
    # attr drifted instead of a wholesale dict-equality failure.
    for key in _RELEASE_ATTR_KEYS:
        want = baseline_attrs[key]
        got = second_attrs[key]
        # NaN never compares equal to itself, so a NaN nodata needs an
        # explicit isnan check on both sides.
        if key == "nodata" and isinstance(want, float) and np.isnan(want):
            assert isinstance(got, float) and np.isnan(got), (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} dropped NaN nodata across the "
                f"round-trip: got {got!r}"
            )
            continue
        # Compare transforms as plain tuples so the container type the
        # reader uses does not affect equality.
        if key == "transform":
            assert want is not None and got is not None, (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} dropped ``attrs['transform']``: "
                f"{want!r} -> {got!r}"
            )
            assert tuple(got) == tuple(want), (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} drifted ``attrs['transform']``: "
                f"{want!r} -> {got!r}"
            )
            continue
        assert got == want, (
            f"release gate (#2341): codec {codec!r} dtype {dtype_name!r} "
            f"drifted ``attrs[{key!r}]`` across the round-trip: "
            f"{want!r} -> {got!r}"
        )
| 352 | + |
| 353 | + |
@pytest.mark.release_gate
def test_release_gate_codec_round_trip_stable_set_matches_supported_features() -> None:
    """Drift guard: ``STABLE_CODECS`` equals the stable ``codec.*`` tier.

    Collects every ``codec.<name>`` key of
    :data:`xrspatial.geotiff.SUPPORTED_FEATURES` whose tier is
    ``stable`` and requires that set of names to equal
    ``STABLE_CODECS``. A promotion or demotion in the tier table that
    does not also touch this file would otherwise leave the
    cartesian-product gate silently testing the wrong codec set.
    """
    stable_from_constant = set()
    for feature, tier in SUPPORTED_FEATURES.items():
        if not feature.startswith("codec."):
            continue
        if tier == "stable":
            # Keep everything after the first dot: the bare codec name.
            stable_from_constant.add(feature.partition(".")[2])

    assert stable_from_constant == set(STABLE_CODECS), (
        "release gate (#2341): STABLE_CODECS drifted from "
        "SUPPORTED_FEATURES; the gate and the runtime tier table "
        "must agree on which codecs are stable. "
        f"constant: {set(STABLE_CODECS)!r}; "
        f"SUPPORTED_FEATURES: {stable_from_constant!r}"
    )
0 commit comments