|
| 1 | +"""Regression tests for issue #1950. |
| 2 | +
|
| 3 | +``_try_nvcomp_batch_decompress`` used to compute its per-tile host |
| 4 | +prefix-sum offsets via a Python ``for`` loop: |
| 5 | +
|
| 6 | +``` |
| 7 | +comp_sizes_list = [len(t) for t in raw_tiles] |
| 8 | +comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) |
| 9 | +for i in range(1, n_tiles): |
| 10 | + comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1] |
| 11 | +``` |
| 12 | +
|
| 13 | +The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and |
| 14 | +the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572 |
| 15 | +both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the |
| 16 | +decompress side keeps the codebase consistent and trims interpreter |
| 17 | +overhead. |
| 18 | +
|
| 19 | +Two guards here: |
| 20 | +
|
| 21 | +1. Correctness -- a tiny synthetic round-trip through the GPU reader |
| 22 | + (when cupy and a CUDA device are available) still decodes every tile |
| 23 | + correctly. A separate numeric test, which always runs, compares the |
| 24 | + ``np.cumsum`` prefix sum directly against the original Python loop. |
| 25 | +2. Structural -- the source uses ``np.cumsum`` (not a Python |
| 26 | + ``range(1, n_tiles)`` loop) for the prefix sum. |
| 27 | +""" |
| 28 | +from __future__ import annotations |
| 29 | + |
| 30 | +import importlib.util |
| 31 | +import os |
| 32 | +import tempfile |
| 33 | + |
| 34 | +import numpy as np |
| 35 | +import pytest |
| 36 | + |
| 37 | + |
def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
    """Source-level guard against reintroducing the Python for loop.

    The fix swaps the per-tile prefix-sum loop for ``np.cumsum``.
    This test reads ``_gpu_decode.py`` (looked up one directory above
    this test file), locates the decompress upload block by its anchor
    comment, and inspects the text that follows it: ``np.cumsum(`` must
    be present and the legacy ``for i in range(1, n_tiles)`` loop must
    be absent. It fires if anyone reverts to the loop or otherwise
    breaks the alignment with ``_batched_d2h_to_bytes`` /
    ``_nvcomp_batch_compress``.
    """
    import pathlib

    src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py"
    # Decode explicitly as UTF-8 (the default encoding of Python source).
    # Without ``encoding=`` the locale's preferred encoding is used, which
    # can fail or mis-decode on non-UTF-8 locales if the module contains
    # any non-ASCII character.
    src = src_path.read_text(encoding="utf-8")

    # Locate the decompress prefix-sum site. It sits inside
    # ``_try_nvcomp_batch_decompress`` and is anchored by the
    # ``Batch host->device upload`` comment that documents the rationale.
    anchor = "Batch host->device upload: concatenate all compressed tiles"
    idx = src.find(anchor)
    assert idx != -1, "could not locate the decompress upload block"
    # Take a 1500-char window after the anchor; the prefix-sum lives
    # within the first ~30 lines of that block.
    block = src[idx:idx + 1500]

    assert "np.cumsum(" in block, (
        "decompress upload block should use np.cumsum for prefix-sum "
        "offsets, aligning with _batched_d2h_to_bytes (issue #1950)."
    )
    # The legacy Python loop would have ``for i in range(1, n_tiles):``.
    assert "for i in range(1, n_tiles)" not in block, (
        "decompress upload block should no longer compute prefix-sum "
        "offsets with a Python for loop (issue #1950)."
    )
| 69 | + |
| 70 | + |
def test_cumsum_matches_loop_prefix_sum_1950():
    """Equivalence between the vectorised cumsum and the prior loop.

    Numeric guard. Even though the two forms produce the same output
    by construction, a runtime check confirms the cumsum form does not
    drift away from the previous semantics across numpy versions.

    Several tile counts are checked, including the degenerate 0, 1 and
    2 cases, so that both sides of the ``n > 1`` guard in the
    vectorised form are exercised rather than only the large-input path.
    """
    rng = np.random.RandomState(1950)

    for n in (0, 1, 2, 1024):
        sizes = rng.randint(100, 100_000, size=n).astype(np.int64)

        # Vectorised form (mirrors the shape described for the fix):
        # offsets[0] stays 0 and offsets[1:] receives the running sum of
        # all sizes except the last one.
        offsets_cumsum = np.zeros(n, dtype=np.int64)
        if n > 1:
            np.cumsum(sizes[:-1], out=offsets_cumsum[1:])

        # Reference: explicit Python prefix sum.
        offsets_loop = np.zeros(n, dtype=np.int64)
        for i in range(1, n):
            offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1]

        np.testing.assert_array_equal(
            offsets_cumsum,
            offsets_loop,
            err_msg=f"prefix-sum mismatch for n={n}",
        )
| 93 | + |
| 94 | + |
@pytest.mark.skipif(
    importlib.util.find_spec("cupy") is None,
    reason="cupy required for nvCOMP path",
)
def test_nvcomp_batch_decompress_roundtrip_1950():
    """Round-trip a deflate-compressed, tiled GeoTIFF through the GPU reader.

    A 1024x1024 float32 random raster is written with
    ``compression="deflate"`` and ``tile_size=256``, read back with
    ``open_geotiff(path, gpu=True)``, and compared to the source array
    with zero tolerance. A mis-staged tile would show up as a numerical
    mismatch.

    The test is skipped when cupy cannot be found or imported, or when
    no CUDA device is available.

    NOTE(review): the intent is to exercise
    ``_try_nvcomp_batch_decompress``. According to the original author's
    note, the GPU read path falls back to a CPU codec when nvCOMP is
    missing, in which case the prefix-sum site is bypassed; setting
    ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` was suggested as a harder gate.
    Neither is verifiable from this file -- confirm against the reader.
    """
    try:
        import cupy
    except ImportError:
        pytest.skip("cupy not importable")
    cuda_ready = cupy.cuda.is_available()
    if not cuda_ready:
        pytest.skip("CUDA device not available")

    import xarray as xr
    from xrspatial.geotiff import open_geotiff, to_geotiff

    n_rows, n_cols = 1024, 1024
    source = np.random.RandomState(1950).rand(n_rows, n_cols).astype(np.float32)
    raster = xr.DataArray(
        source,
        dims=["y", "x"],
        coords={"y": np.arange(n_rows), "x": np.arange(n_cols)},
        attrs={"crs": 4326},
    )

    with tempfile.TemporaryDirectory() as workdir:
        tif_path = os.path.join(workdir, "tmp_1950_deflate.tif")
        to_geotiff(raster, tif_path, compression="deflate", tile_size=256)

        # Read back through the GPU pipeline.
        loaded = open_geotiff(tif_path, gpu=True)
        assert loaded.shape == (n_rows, n_cols)

        # Bring the pixels to host memory: objects exposing ``.get`` are
        # converted via cupy, anything else via numpy.
        payload = loaded.data
        if hasattr(payload, "get"):
            round_tripped = cupy.asnumpy(payload)
        else:
            round_tripped = np.asarray(payload)

    np.testing.assert_allclose(round_tripped, source, atol=0, rtol=0)
0 commit comments