Skip to content

Commit add6529

Browse files
authored
geotiff: GPU writer overview loop uses cupy.putmask for in-place NaN rewrite (#1948) (#1952)
1 parent 97e3f15 commit add6529

2 files changed

Lines changed: 213 additions & 6 deletions

File tree

xrspatial/geotiff/_writers/gpu.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -479,8 +479,31 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp):
479479
# all-sentinel cell NaN back to the sentinel after each level
480480
# so the on-disk overview tiles still carry the sentinel value
481481
# external readers expect.
482+
#
483+
# The sentinel rewrite uses ``cupy.putmask`` rather than
484+
# ``current.copy(); current[nan_mask] = sentinel`` because
485+
# ``make_overview_gpu`` always returns a freshly allocated
486+
# buffer (``_block_reduce_2d_gpu`` ends in ``cupy.nan*`` /
487+
# ``cupy.around(...).astype(...)`` / ``cropped[::2, ::2].copy()``,
488+
# and the 3-D path ends in ``cupy.stack``) so nothing else
489+
# aliases the buffer between the return and the rewrite.
490+
# Dropping the redundant ``copy()`` skips one chunk-sized GPU
491+
# allocation per overview level. Mirrors the in-place sentinel
492+
# rewrite ``_apply_nodata_mask_gpu`` adopted in #1934. See
493+
# issue #1948.
482494
current = arr
483495
cumulative_factor = 1
496+
# ``make_overview_gpu`` preserves dtype, so the sentinel cast is
497+
# loop-invariant. Hoist it (and the float/finite gate) out of the
498+
# inner ``while`` to skip redundant per-level scalar work.
499+
rewrite_nodata = (
500+
nodata is not None
501+
and np_dtype.kind == 'f'
502+
and not np.isnan(float(nodata))
503+
)
504+
sentinel_scalar = (
505+
np_dtype.type(nodata) if rewrite_nodata else None
506+
)
484507
for target_factor in overview_levels:
485508
# Halve repeatedly until the cumulative decimation matches
486509
# the requested factor. Validation has already established
@@ -490,14 +513,10 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp):
490513
current = make_overview_gpu(current, method=overview_resampling,
491514
nodata=nodata)
492515
cumulative_factor *= 2
493-
if (nodata is not None
494-
and np.dtype(str(current.dtype)).kind == 'f'
495-
and not np.isnan(float(nodata))):
516+
if rewrite_nodata:
496517
nan_mask = cupy.isnan(current)
497518
if bool(nan_mask.any().item()):
498-
current = current.copy()
499-
current[nan_mask] = np.dtype(
500-
str(current.dtype)).type(nodata)
519+
cupy.putmask(current, nan_mask, sentinel_scalar)
501520
oh, ow = current.shape[:2]
502521
parts.append(_gpu_compress_to_part(current, ow, oh, samples))
503522

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
"""Regression tests for issue #1948.
2+
3+
The COG overview generation loop inside ``write_geotiff_gpu`` used to
4+
replace the in-place NaN-to-sentinel rewrite with
5+
``current = current.copy(); current[nan_mask] = sentinel``. Every call
6+
to ``make_overview_gpu`` returns a freshly allocated cupy buffer
7+
(``_block_reduce_2d_gpu`` ends in ``cupy.nan*`` / ``cupy.around(...).astype(...)`` /
8+
``cropped[::2, ::2].copy()``, the 3-D path ends in ``cupy.stack``), so
9+
nothing else aliases the buffer between the return and the rewrite. The
10+
``.copy()`` allocated a second chunk-sized GPU buffer per overview
11+
level for no semantic gain.
12+
13+
The fix swaps the rewrite to ``cupy.putmask(current, nan_mask, sentinel)``
14+
which mutates the existing buffer in place. This drops one chunk-sized
15+
device allocation per overview level and mirrors the in-place sentinel
16+
rewrite ``_apply_nodata_mask_gpu`` adopted in #1934.
17+
18+
Two guards here:
19+
20+
1. Correctness -- a COG write with a sentinel-bearing float raster
21+
round-trips through the GPU writer and back to the CPU reader with
22+
the sentinel preserved in every overview level.
23+
2. Structural -- the writer source uses ``cupy.putmask`` and no longer
24+
carries the redundant ``current = current.copy()`` line in the
25+
overview loop.
26+
"""
27+
from __future__ import annotations
28+
29+
import os
30+
import tempfile
31+
32+
import numpy as np
33+
import xarray as xr
34+
35+
from xrspatial.geotiff.tests.conftest import requires_gpu as _gpu_only
36+
37+
38+
def _make_float_raster_with_nodata(height: int, width: int, sentinel: float):
39+
"""Build a float32 raster whose sentinel pixels would poison overviews.
40+
41+
Reserve a contiguous all-sentinel sub-rectangle so that every 2x
42+
decimation step still contains at least one fully-sentinel 2x2
43+
block. ``_block_reduce_2d_gpu`` masks the sentinel back to NaN
44+
before reducing, so a partially-sentinel block returns a valid mean
45+
and the overview pixel never becomes NaN; only a fully-sentinel
46+
block triggers the NaN->sentinel rewrite branch the fix touches.
47+
"""
48+
import cupy
49+
50+
arr = np.random.RandomState(1948).rand(height, width).astype(np.float32)
51+
# Bottom-right quadrant becomes all-sentinel so every 2x reduction
52+
# of that region stays all-sentinel and the NaN rewrite branch
53+
# fires at every overview level.
54+
arr[height // 2:, width // 2:] = sentinel
55+
return cupy.asarray(arr)
56+
57+
58+
def test_gpu_writer_overview_loop_uses_putmask_1948():
59+
"""Source-level guard against the redundant ``current.copy()``.
60+
61+
The fix replaces the ``current = current.copy(); current[nan_mask] = ...``
62+
pair with ``cupy.putmask(current, nan_mask, ...)`` so the buffer
63+
``make_overview_gpu`` returned is mutated in place. This guard
64+
fires if anyone reintroduces the redundant copy.
65+
"""
66+
import pathlib
67+
68+
src_path = pathlib.Path(__file__).parent.parent / "_writers" / "gpu.py"
69+
src = src_path.read_text()
70+
# The overview loop should call cupy.putmask, not current.copy() + indexed write.
71+
assert "cupy.putmask(" in src, (
72+
"write_geotiff_gpu overview loop should use cupy.putmask "
73+
"for the in-place NaN->sentinel rewrite (issue #1948)."
74+
)
75+
# Confirm the in-place pattern is the one inside the overview branch,
76+
# not somewhere unrelated. The pattern's location follows the
77+
# ``make_overview_gpu`` call and the ``nan_mask = cupy.isnan(current)``
78+
# gate; assert the order is preserved.
79+
idx_overview = src.find("make_overview_gpu(current")
80+
idx_putmask = src.find("cupy.putmask(", idx_overview)
81+
assert idx_overview != -1 and idx_putmask != -1, (
82+
"could not locate the overview-loop in-place rewrite"
83+
)
84+
# The legacy two-line pattern would have ``current = current.copy()``
85+
# right before the indexed write. Ensure the overview branch no
86+
# longer contains that exact line. Anchor the slice on the next
87+
# statement after the inner ``while`` (``parts.append(...)``) so
88+
# the window tracks the real loop body instead of a fixed
89+
# character count that drifts as surrounding code changes.
90+
idx_parts_append = src.find("parts.append(", idx_overview)
91+
assert idx_parts_append != -1, (
92+
"could not locate the ``parts.append(`` sentinel that closes "
93+
"the overview-loop body"
94+
)
95+
overview_branch = src[idx_overview:idx_parts_append]
96+
assert "current = current.copy()" not in overview_branch, (
97+
"overview loop should no longer copy the cupy buffer before "
98+
"the in-place sentinel rewrite (issue #1948)."
99+
)
100+
101+
102+
@_gpu_only
103+
def test_gpu_writer_cog_overview_sentinel_roundtrip_1948():
104+
"""COG write -> CPU read preserves the sentinel through every overview.
105+
106+
Regression check on the correctness of the in-place rewrite. If
107+
``cupy.putmask`` ever drifts away from the ``current.copy()`` +
108+
indexed-write semantics (e.g. broadcasts the sentinel as the wrong
109+
dtype) the round-trip would surface NaN where the sentinel should
110+
be, breaking external-reader masking. The test is gated on a GPU
111+
because the overview loop only runs on the GPU writer code path.
112+
"""
113+
from xrspatial.geotiff import open_geotiff, write_geotiff_gpu
114+
115+
sentinel = np.float32(-9999.0)
116+
height, width = 1024, 1024
117+
arr_gpu = _make_float_raster_with_nodata(height, width, float(sentinel))
118+
119+
da = xr.DataArray(
120+
arr_gpu, dims=["y", "x"],
121+
coords={"y": np.arange(height), "x": np.arange(width)},
122+
attrs={"crs": 4326, "nodata": float(sentinel)},
123+
)
124+
125+
with tempfile.TemporaryDirectory() as td:
126+
path = os.path.join(td, "tmp_1948_cog.tif")
127+
write_geotiff_gpu(
128+
da, path, compression="deflate", tile_size=256,
129+
cog=True, overview_levels=[2, 4],
130+
)
131+
132+
# Read full-resolution and the two overview levels.
133+
full = open_geotiff(path)
134+
ov1 = open_geotiff(path, overview_level=1)
135+
ov2 = open_geotiff(path, overview_level=2)
136+
137+
# Full-resolution: sentinel pixels survive as NaN (the read path
138+
# masks the sentinel back to NaN since attrs['nodata'] is set).
139+
assert np.isnan(full.values).any(), (
140+
"full-resolution overview lost its sentinel pixels"
141+
)
142+
assert ov1.shape == (height // 2, width // 2), (
143+
f"overview level 1 shape {ov1.shape}, expected "
144+
f"({height // 2}, {width // 2})"
145+
)
146+
assert ov2.shape == (height // 4, width // 4), (
147+
f"overview level 2 shape {ov2.shape}, expected "
148+
f"({height // 4}, {width // 4})"
149+
)
150+
# Each overview must still mask the sentinel. With the prior
151+
# ``current.copy()`` pattern the test passed because the buffer was
152+
# copied; with ``cupy.putmask`` the test still passes because the
153+
# buffer is mutated in place. This guard fires only if the rewrite
154+
# is dropped entirely or sentinel dtype/promotion semantics change.
155+
assert np.isnan(ov1.values).any(), (
156+
"overview level 1 lost its sentinel pixels after the in-place rewrite"
157+
)
158+
assert np.isnan(ov2.values).any(), (
159+
"overview level 2 lost its sentinel pixels after the in-place rewrite"
160+
)
161+
162+
163+
@_gpu_only
164+
def test_gpu_writer_overview_uses_make_overview_gpu_fresh_buffer_1948():
165+
"""``make_overview_gpu`` returns a freshly allocated cupy buffer.
166+
167+
The in-place rewrite contract relies on ``make_overview_gpu``
168+
handing back a buffer that no caller-visible state aliases. This
169+
guard exercises every overview-method path inside
170+
``_block_reduce_2d_gpu`` and confirms the returned buffer is a
171+
different cupy.ndarray than the input.
172+
"""
173+
import cupy
174+
175+
from xrspatial.geotiff._gpu_decode import make_overview_gpu
176+
177+
rng = np.random.RandomState(1948)
178+
src_np = rng.rand(64, 64).astype(np.float32)
179+
src_gpu = cupy.asarray(src_np)
180+
181+
for method in ("mean", "nearest", "min", "max", "median"):
182+
out = make_overview_gpu(src_gpu, method=method, nodata=None)
183+
assert int(out.data.ptr) != int(src_gpu.data.ptr), (
184+
f"make_overview_gpu(method={method!r}) returned an aliased buffer"
185+
)
186+
assert out.shape == (32, 32), (
187+
f"make_overview_gpu(method={method!r}) returned shape {out.shape}"
188+
)

0 commit comments

Comments
 (0)