Skip to content

Commit 19fc4db

Browse files
authored
Add stable-codec read/write/read round-trip release gate (#2360) (#2365)
1 parent 896fba4 commit 19fc4db

2 files changed

Lines changed: 382 additions & 0 deletions

File tree

docs/source/reference/release_gate_geotiff.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ Local GeoTIFF read and write
122122
- Lossless byte-for-byte round-trip on integer and float dtypes.
123123
- ``xrspatial/geotiff/tests/test_supported_features_tiers_2137.py``,
124124
``xrspatial/geotiff/tests/test_compression.py``
125+
* - Stable codec round-trip (read / write / read)
126+
- stable
127+
- For every stable codec * promised dtype combination, a full
128+
write / read / write / read cycle preserves byte-exact pixels
129+
(NaN-aware for float) and the canonical release attrs. See
130+
the cited test for the codec, dtype, and attr-key matrix.
131+
- ``xrspatial/geotiff/tests/test_release_gate_codec_round_trip_2341.py``
125132
* - Codec ``lerc`` / ``jpeg2000`` / ``j2k`` / ``lz4``
126133
- experimental
127134
- Rejected by default; accepted with
Lines changed: 375 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,375 @@
1+
"""Release gate: stable-codec read/write/read round-trip (epic #2341).
2+
3+
PR 4 of 5 of epic #2341. The release contract names a specific set of
4+
codecs as ``stable`` in :data:`xrspatial.geotiff.SUPPORTED_FEATURES`:
5+
``none``, ``deflate``, ``lzw``, ``zstd``, ``packbits``. The release
6+
notes promise that on any of these codecs, a round-trip preserves both
7+
bit-exact pixels AND every canonical release attr key, on every dtype
8+
the library promises to round-trip.
9+
10+
Existing tests split the contract:
11+
12+
* ``test_compression.py`` covers codec internals (LZW dictionary edge
13+
cases, PackBits boundary cases, deflate stream framing).
14+
* ``test_supported_features_tiers_2137.py`` pins the
15+
``SUPPORTED_FEATURES`` tier table.
16+
* ``test_release_gate_codecs.py`` pins lossless pixel round-trip for
17+
two dtypes (``uint16``, ``float32``).
18+
19+
This file is the joint gate: the cartesian product of every stable
20+
codec with every promised dtype, asserting both pixel equality AND
21+
release-attr equality through a full read/write/read cycle.
22+
23+
Out of scope:
24+
25+
* Experimental codecs (``lerc``, ``jpeg2000``, ``j2k``, ``lz4``) --
26+
release tier is ``experimental``; covered by
27+
``test_supported_features_tiers_2137.py``.
28+
* Internal-only ``jpeg`` -- not part of the public surface.
29+
* COG layout (``test_release_gate_cog.py``).
30+
* Backend parity (``test_backend_parity_matrix.py``).
31+
"""
32+
from __future__ import annotations
33+
34+
import uuid
35+
36+
import numpy as np
37+
import pytest
38+
import xarray as xr
39+
40+
from xrspatial.geotiff import SUPPORTED_FEATURES, open_geotiff, to_geotiff
41+
from xrspatial.geotiff._compression import (COMPRESSION_DEFLATE, COMPRESSION_LZW,
42+
COMPRESSION_NONE, COMPRESSION_PACKBITS,
43+
COMPRESSION_ZSTD)
44+
from xrspatial.geotiff._header import parse_header, parse_ifd
45+
46+
# The stable lossless codec set. Kept in lockstep with the ``codec.*``
47+
# entries tiered ``stable`` in
48+
# :data:`xrspatial.geotiff.SUPPORTED_FEATURES`. The drift guard at the
49+
# bottom of this file fails the build if the two sets disagree.
50+
STABLE_CODECS = ("none", "deflate", "lzw", "zstd", "packbits")
51+
52+
# The dtype set the release contract promises to round-trip through
53+
# every stable codec. ``int16`` and ``int32`` exercise the signed
54+
# integer path; ``float32`` and ``float64`` exercise the IEEE float
55+
# path with NaN as the nodata sentinel.
56+
DTYPES = ("int16", "int32", "float32", "float64")
57+
58+
# TIFF tag value the on-disk file should carry for each stable codec
59+
# name. The reader IFD parser exposes ``ifd.compression`` so we can
60+
# assert the on-disk tag without depending on a high-level
61+
# ``attrs['compression']`` key (none exists; see issue #2341).
62+
_CODEC_TO_TIFF_TAG = {
63+
"none": COMPRESSION_NONE,
64+
"deflate": COMPRESSION_DEFLATE,
65+
"lzw": COMPRESSION_LZW,
66+
"zstd": COMPRESSION_ZSTD,
67+
"packbits": COMPRESSION_PACKBITS,
68+
}
69+
70+
# Per-dtype integer nodata sentinel. Float dtypes use NaN. The
71+
# integer sentinels are well outside the natural value range of the
72+
# fixture below (small ascending integers) so the sentinel never
73+
# collides with a real pixel.
74+
# Each integer sentinel is the minimum representable value of its dtype,
# stored as a numpy scalar of that same dtype (``np.int16(-32768)``,
# ``np.int32(-2147483648)``).
_INT_NODATA = {
    name: np.dtype(name).type(np.iinfo(name).min)
    for name in ("int16", "int32")
}
78+
79+
# Release-attr keys the cartesian-product gate asserts on. These come
80+
# from the issue body (#2341) and from the canonical attrs the reader
81+
# emits (see ``test_release_gate_attrs_contract.py``). ``raster_type``
82+
# is included even though it is only emitted when the source was
83+
# ``RasterPixelIsPoint``; we use a small fixture that defaults to
84+
# ``'area'`` so it is normalized below in ``_canonical_attrs``.
85+
_RELEASE_ATTR_KEYS = (
86+
"transform",
87+
"crs",
88+
"crs_wkt",
89+
"nodata",
90+
"masked_nodata",
91+
"georef_status",
92+
"raster_type",
93+
)
94+
95+
96+
def _make_input(dtype_name: str) -> xr.DataArray:
    """Return the 128x128 georeferenced fixture raster for ``dtype_name``.

    Pixel (0, 0) holds the nodata sentinel: NaN for floating dtypes, the
    ``_INT_NODATA`` entry for integer dtypes. All other pixels follow a
    deterministic ramp (``linspace(-100, 100)`` for floats, ``0..n-1``
    for integers), so a flipped axis or a wrong stride changes pixel
    values instead of going unnoticed.

    The returned array has dims ``("y", "x")``, pixel-centre coordinates
    on a 30-unit grid with a descending y axis, and
    ``attrs == {"crs": 32610, "nodata": <sentinel>}``.
    """
    rows = cols = 128
    count = rows * cols
    np_dtype = np.dtype(dtype_name)

    if np.issubdtype(np_dtype, np.floating):
        nodata: float | int = float("nan")
        pixels = np.linspace(-100.0, 100.0, count, dtype=np_dtype)
    else:
        # The integer ramp tops out at ``count - 1 == 16383``, which fits
        # in ``int16`` and stays far from the dtype-minimum sentinel. A
        # narrower dtype (e.g. ``int8``) would wrap and could collide
        # with the sentinel; cap the ramp or shrink the fixture if one
        # is ever added.
        pixels = np.arange(count, dtype=np_dtype)
        nodata = _INT_NODATA[dtype_name]

    pixels = pixels.reshape(rows, cols)
    pixels[0, 0] = nodata

    # Pixel centres: x ascends from the left edge, y descends from the
    # top edge, both in steps of one 30-unit pixel.
    pixel_size = 30.0
    northing = 4000000.0 - pixel_size * (np.arange(rows) + 0.5)
    easting = 500000.0 + pixel_size * (np.arange(cols) + 0.5)

    return xr.DataArray(
        pixels,
        dims=("y", "x"),
        coords={"y": northing, "x": easting},
        attrs={"crs": 32610, "nodata": nodata},
    )
135+
136+
137+
def _canonical_attrs(da: xr.DataArray) -> dict:
    """Return ``da.attrs`` restricted to ``_RELEASE_ATTR_KEYS``.

    Every release key appears in the result. A key absent from
    ``da.attrs`` maps to ``None``, except ``raster_type``, which maps to
    ``'area'`` so that an unstamped raster compares equal to one that
    explicitly says ``'area'``.
    """
    return {
        key: da.attrs.get(key, "area" if key == "raster_type" else None)
        for key in _RELEASE_ATTR_KEYS
    }
152+
153+
154+
def _read_tiff_compression_tag(path: str) -> int:
    """Return the ``compression`` value parsed from the file's first IFD.

    The whole file is read into memory and handed to ``parse_header`` /
    ``parse_ifd``. Going through the IFD pins the codec that was
    actually written to disk, independent of whatever attrs the
    high-level reader puts on the DataArray.
    """
    with open(path, "rb") as stream:
        raw = stream.read()
    tiff_header = parse_header(raw)
    first_ifd = parse_ifd(raw, tiff_header.first_ifd_offset, tiff_header)
    return first_ifd.compression
167+
168+
169+
def _assert_pixels_equal(actual: np.ndarray, expected: np.ndarray,
                         *, codec: str, dtype_name: str) -> None:
    """Fail unless ``actual`` matches ``expected`` in shape, dtype and values.

    Floating arrays are compared with NaN treated as equal to NaN;
    every other dtype is compared with strict ``np.array_equal``, so an
    integer sentinel is just another value that must match.

    ``codec`` and ``dtype_name`` are only used to label the failure
    message.

    Raises ``AssertionError`` on a shape mismatch, a dtype mismatch, or
    a value mismatch. The value-mismatch message reports the index and
    both values of the first divergent pixel.
    """
    assert actual.shape == expected.shape, (
        f"release gate (#2341): codec {codec!r} dtype {dtype_name!r} "
        f"reshaped the array across the round-trip: "
        f"{expected.shape} -> {actual.shape}"
    )
    assert actual.dtype == expected.dtype, (
        f"release gate (#2341): codec {codec!r} promoted dtype "
        f"{dtype_name!r} to {actual.dtype!r} across the round-trip"
    )

    float_path = bool(np.issubdtype(expected.dtype, np.floating))
    if np.array_equal(actual, expected, equal_nan=float_path):
        return

    # Locate the first divergent pixel so a debug session can jump
    # straight to the offending tile / row.
    diverged = actual != expected
    if float_path:
        # NaN != NaN elementwise, so drop positions where both are NaN.
        diverged = diverged & ~(np.isnan(actual) & np.isnan(expected))
    hits = np.argwhere(diverged)
    if hits.size:
        first_idx = tuple(int(v) for v in hits[0])
        first_actual = actual[first_idx]
        first_expected = expected[first_idx]
    else:
        first_idx = first_actual = first_expected = None

    raise AssertionError(
        f"release gate (#2341): codec {codec!r} did not preserve "
        f"{dtype_name!r} pixels byte-for-byte; the release contract "
        f"names this codec as lossless for this dtype. First "
        f"divergence at index {first_idx!r}: actual="
        f"{first_actual!r}, expected={first_expected!r}"
    )
215+
216+
217+
@pytest.mark.release_gate
@pytest.mark.parametrize("dtype_name", DTYPES)
@pytest.mark.parametrize("codec", STABLE_CODECS)
def test_release_gate_codec_round_trip(tmp_path, codec, dtype_name) -> None:
    """Stable codec * dtype: pixels and release attrs survive a full
    write/read/write/read cycle.

    Steps:

    1. Build an in-memory DataArray with a known transform, CRS, and
       nodata sentinel (NaN for float; per-dtype int min for int).
    2. Write via ``to_geotiff(source, path, compression=codec)``.
    3. Read back via ``open_geotiff(path)`` -- this is the canonical
       baseline. Assert the baseline pixels equal the in-memory
       source, so a codec that corrupts pixels in a *stable* way
       cannot pass by merely being self-consistent.
    4. Write the baseline DataArray to a second path under the same
       codec.
    5. Read the second path back; assert byte-exact pixels and every
       release-attr key matches the baseline.

    Both files are also checked for the expected on-disk TIFF
    Compression tag.

    The two-pass shape is what makes this a *round-trip* gate
    rather than a single-pass write-and-read gate: the canonical
    attrs themselves have to survive the second cycle, not just the
    first.
    """
    # Unique tag per parametrized case so parallel pytest workers and
    # parallel rockout worktrees never collide on the same tmp file.
    nonce = uuid.uuid4().hex[:8]
    write_first = str(
        tmp_path
        / f"release_gate_2341_{codec}_{dtype_name}_first_{nonce}.tif"
    )
    write_second = str(
        tmp_path
        / f"release_gate_2341_{codec}_{dtype_name}_second_{nonce}.tif"
    )

    source = _make_input(dtype_name)
    is_float = np.issubdtype(np.dtype(dtype_name), np.floating)

    # The masking behaviour differs by dtype: integer reads default to
    # masking the sentinel into NaN (which would change dtype and break
    # the byte-exact comparison), so we read integers with
    # ``mask_nodata=False`` to keep the sentinel as a real pixel.
    # Float reads round-trip NaN as NaN regardless of mask_nodata.
    mask_kwargs: dict = {} if is_float else {"mask_nodata": False}

    # Pass 1: write the in-memory source. The writer infers NaN as the
    # implicit float sentinel without a ``nodata=`` kwarg, so only the
    # integer branch passes one explicitly. This keeps the test from
    # locking the writer into accepting ``nodata=NaN`` if that ever
    # becomes a no-op or a rejected redundancy.
    pass_one_kwargs: dict = (
        {} if is_float else {"nodata": source.attrs["nodata"]}
    )
    to_geotiff(
        source,
        write_first,
        compression=codec,
        tiled=False,
        **pass_one_kwargs,
    )

    baseline = open_geotiff(write_first, **mask_kwargs)
    baseline_pixels = np.asarray(baseline.values)
    baseline_attrs = _canonical_attrs(baseline)

    # The on-disk TIFF Compression tag must reflect the requested codec.
    tag_first = _read_tiff_compression_tag(write_first)
    assert tag_first == _CODEC_TO_TIFF_TAG[codec], (
        f"release gate (#2341): codec {codec!r} encoded as TIFF tag "
        f"{tag_first} on first write; expected "
        f"{_CODEC_TO_TIFF_TAG[codec]} per the codec -> tag map"
    )

    # First leg: the decoded baseline must equal what was written.
    # Comparing only second-vs-baseline would accept any corruption
    # that is stable under re-encoding (zeroed pixels, clamping, a
    # lossy-but-idempotent quantisation).
    _assert_pixels_equal(
        baseline_pixels,
        np.asarray(source.values),
        codec=codec,
        dtype_name=dtype_name,
    )

    # Pass 2: rewrite the baseline DataArray under the same codec.
    # The baseline DataArray already carries ``attrs['nodata']`` from
    # the first read; the writer picks the sentinel up from the attrs
    # on the float path. For the integer branch we pass the sentinel
    # explicitly so the writer does not need to fall back to a default.
    pass_two_kwargs: dict = (
        {} if is_float else {"nodata": baseline.attrs.get("nodata")}
    )
    to_geotiff(
        baseline,
        write_second,
        compression=codec,
        tiled=False,
        **pass_two_kwargs,
    )

    second = open_geotiff(write_second, **mask_kwargs)
    second_pixels = np.asarray(second.values)
    second_attrs = _canonical_attrs(second)

    tag_second = _read_tiff_compression_tag(write_second)
    assert tag_second == _CODEC_TO_TIFF_TAG[codec], (
        f"release gate (#2341): codec {codec!r} encoded as TIFF tag "
        f"{tag_second} on the second write; expected "
        f"{_CODEC_TO_TIFF_TAG[codec]} per the codec -> tag map"
    )

    # Second leg: the re-encoded file must decode to the baseline.
    _assert_pixels_equal(
        second_pixels, baseline_pixels, codec=codec, dtype_name=dtype_name,
    )

    # ``np.float32`` is not a ``float`` subclass (``np.float64`` is), so
    # accept any numpy floating scalar when detecting a NaN sentinel.
    float_types = (float, np.floating)

    # Per-attribute comparison so a single failing key reports which
    # attr drifted instead of a wholesale dict-equality failure.
    for key in _RELEASE_ATTR_KEYS:
        want = baseline_attrs[key]
        got = second_attrs[key]
        if (
            key == "nodata"
            and isinstance(want, float_types)
            and np.isnan(want)
        ):
            # NaN never compares equal to itself; check NaN-ness instead.
            assert isinstance(got, float_types) and np.isnan(got), (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} dropped NaN nodata across the "
                f"round-trip: got {got!r}"
            )
            continue
        if key == "transform":
            assert want is not None and got is not None, (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} dropped ``attrs['transform']``: "
                f"{want!r} -> {got!r}"
            )
            # Compare as tuples so the container type does not matter.
            assert tuple(got) == tuple(want), (
                f"release gate (#2341): codec {codec!r} dtype "
                f"{dtype_name!r} drifted ``attrs['transform']``: "
                f"{want!r} -> {got!r}"
            )
            continue
        assert got == want, (
            f"release gate (#2341): codec {codec!r} dtype {dtype_name!r} "
            f"drifted ``attrs[{key!r}]`` across the round-trip: "
            f"{want!r} -> {got!r}"
        )
352+
353+
354+
@pytest.mark.release_gate
def test_release_gate_codec_round_trip_stable_set_matches_supported_features() -> None:
    """Drift guard: ``STABLE_CODECS`` equals the stable ``codec.*`` entries.

    Collects every ``SUPPORTED_FEATURES`` key that starts with
    ``codec.`` and is tiered ``stable``, strips the prefix, and requires
    the resulting set to equal ``set(STABLE_CODECS)``. A tier change
    that does not also update this file then fails here, instead of
    leaving the cartesian-product gate silently out of sync with the
    runtime tier table.
    """
    prefix = "codec."
    stable_from_constant = set()
    for feature, tier in SUPPORTED_FEATURES.items():
        if feature.startswith(prefix) and tier == "stable":
            # Everything after the first "." is the codec name.
            stable_from_constant.add(feature[len(prefix):])

    assert stable_from_constant == set(STABLE_CODECS), (
        "release gate (#2341): STABLE_CODECS drifted from "
        "SUPPORTED_FEATURES; the gate and the runtime tier table "
        "must agree on which codecs are stable. "
        f"constant: {set(STABLE_CODECS)!r}; "
        f"SUPPORTED_FEATURES: {stable_from_constant!r}"
    )

0 commit comments

Comments
 (0)