Skip to content

Commit 1ada658

Browse files
authored
geotiff: refuse non-numeric nodata / _FillValue early (#1973) (#1983)
* geotiff: refuse non-numeric nodata / _FillValue early (#1973) attrs['_FillValue']='missing' (or attrs['nodata']='missing', or a non-numeric nodata= kwarg) used to crash deep inside the writer with ``ufunc 'isnan' not supported``. The TypeError did not name the offending attribute and gave no hint that nodata was the cause. Add _validate_nodata_arg in _validation.py and call it from to_geotiff, _write_vrt_tiled, and write_geotiff_gpu so the kwarg path is checked at the boundary. Tighten _resolve_nodata_attr to raise ValueError naming the bad attr when attrs['nodata'] or attrs['_FillValue'] is non-numeric. The rioxarray-style attrs['nodatavals'] branch keeps its existing skip-on-non-numeric behaviour (per-band tuples often carry placeholder entries from arbitrary upstream pipelines, so a single bad entry should not block writing). * geotiff (#1973): reject nodata=True in validator and warn on all-non-numeric nodatavals - add an isinstance(nodata, bool) guard to _validate_nodata_arg so the GPU and VRT writers refuse nodata=True with the same TypeError the eager writer already raises for #1911; previously float(True) == 1.0 silently coerced the bool past the numeric branch on those paths - warn (UserWarning) from _resolve_nodata_attr when every attrs['nodatavals'] entry is non-numeric, since a tuple with zero usable sentinels is more likely a user error than an intentional no-sentinel signal; return value stays None to honour the function's skip-on-non-numeric contract - factor the duplicate non-numeric error string used by the attrs['nodata'] and attrs['_FillValue'] branches into a shared _nodata_attr_non_numeric_msg helper - add regression tests for the bool branch (validator + eager / VRT / GPU entry points), the all-non-numeric warning, and the no-warn paths (usable entry present, all-NaN tuple)
1 parent 66eb3b0 commit 1ada658

5 files changed

Lines changed: 259 additions & 2 deletions

File tree

xrspatial/geotiff/_attrs.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
"""
1818
from __future__ import annotations
1919

20+
import warnings
21+
2022
import numpy as np
2123

2224
from ._coords import (
@@ -206,6 +208,10 @@ def _resolve_nodata_attr(attrs: dict):
206208
"""
207209
nodata = attrs.get('nodata')
208210
if nodata is not None:
211+
try:
212+
float(nodata)
213+
except (TypeError, ValueError) as e:
214+
raise ValueError(_nodata_attr_non_numeric_msg('nodata', nodata)) from e
209215
return nodata
210216

211217
vals = attrs.get('nodatavals')
@@ -214,30 +220,59 @@ def _resolve_nodata_attr(attrs: dict):
214220
seq = list(vals)
215221
except TypeError:
216222
seq = [vals]
223+
saw_non_numeric = False
217224
for v in seq:
218225
if v is None:
219226
continue
220227
try:
221228
fv = float(v)
222229
except (TypeError, ValueError):
230+
saw_non_numeric = True
223231
continue
224232
if np.isnan(fv):
225233
continue
226234
return v
235+
# A tuple where every entry is non-numeric is almost certainly a
236+
# user error (typo, stringified sentinel) rather than a legitimate
237+
# "no sentinel" signal. Warn so the caller sees it, but still fall
238+
# through to the rest of the resolution chain rather than raising:
239+
# the rest of the function's contract is "skip non-numeric entries".
240+
if saw_non_numeric:
241+
warnings.warn(
242+
f"attrs['nodatavals']={vals!r} contained only non-numeric "
243+
f"entries; no usable sentinel could be resolved from it. "
244+
f"Pass ``nodata=`` explicitly or fix the attr.",
245+
UserWarning,
246+
stacklevel=2,
247+
)
227248

228249
fill = attrs.get('_FillValue')
229250
if fill is not None:
230251
try:
231252
ffv = float(fill)
232-
except (TypeError, ValueError):
233-
return fill # non-numeric -- pass through verbatim
253+
except (TypeError, ValueError) as e:
254+
raise ValueError(_nodata_attr_non_numeric_msg('_FillValue', fill)) from e
234255
if np.isnan(ffv):
235256
return None
236257
return fill
237258

238259
return None
239260

240261

262+
def _nodata_attr_non_numeric_msg(attr_name: str, value) -> str:
263+
"""Error string shared by the ``attrs['nodata']`` and ``attrs['_FillValue']``
264+
non-numeric branches in ``_resolve_nodata_attr`` (#1973)."""
265+
return (
266+
f"attrs[{attr_name!r}]={value!r} is not numeric "
267+
f"({type(value).__name__}). The writer needs a numeric "
268+
f"sentinel to compare against pixel values; passing a "
269+
f"non-numeric value would otherwise crash inside "
270+
f"``np.isnan`` with an opaque ufunc error. Drop the "
271+
f"attr, replace it with a numeric sentinel, or pass "
272+
f"``nodata=`` explicitly (issue #1973)."
273+
)
274+
275+
241276
def _merge_friendly_extra_tags(extra_tags_list, attrs: dict) -> list | None:
242277
"""Combine ``attrs['extra_tags']`` with friendly tag attrs.
243278

xrspatial/geotiff/_validation.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,3 +269,36 @@ def _validate_predictor_sample_format(predictor, sample_format) -> None:
269269
f"pair, e.g. `gdal_translate -co PREDICTOR=2` for integers or "
270270
f"`-co PREDICTOR=1` to drop the predictor."
271271
)
272+
273+
274+
def _validate_nodata_arg(nodata) -> None:
275+
"""Reject non-numeric ``nodata=`` at the writer entry point (#1973).
276+
277+
``None`` (no sentinel) passes through. ``bool`` is rejected with
278+
``TypeError`` so all three writer entry points (eager, GPU, VRT)
279+
refuse ``nodata=True`` / ``nodata=False`` the same way the eager
280+
path already does for issue #1911 -- ``float(True) == 1.0`` would
281+
otherwise slip a bool past the numeric branch on the GPU/VRT paths
282+
that do not have their own bool guard. Anything else is run
283+
through ``float()``: success means the writer's downstream
284+
``np.isnan(nodata)`` and integer-cast paths will not blow up.
285+
Failure raises ``ValueError`` with the offending repr, so users
286+
see ``nodata='missing'`` flagged at the boundary instead of an
287+
opaque ``ufunc 'isnan' not supported`` TypeError from inside the
288+
writer.
289+
"""
290+
if nodata is None:
291+
return
292+
if isinstance(nodata, (bool, np.bool_)):
293+
raise TypeError(
294+
f"nodata must be numeric (int or float), got {nodata!r}")
295+
try:
296+
float(nodata)
297+
except (TypeError, ValueError) as e:
298+
raise ValueError(
299+
f"nodata must be numeric or None, got {nodata!r} "
300+
f"(type {type(nodata).__name__}). The writer compares it "
301+
f"against pixel values via ``np.isnan`` and casts it to "
302+
f"the array dtype; a non-numeric value would otherwise "
303+
f"crash inside NumPy with a ufunc TypeError."
304+
) from e

xrspatial/geotiff/_writers/eager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
)
4343
from .._validation import (
4444
_validate_3d_writer_dims,
45+
_validate_nodata_arg,
4546
_validate_tile_size_arg,
4647
)
4748
from .._writer import write
@@ -267,6 +268,8 @@ def to_geotiff(data: xr.DataArray | np.ndarray,
267268
if tiled:
268269
_validate_tile_size_arg(tile_size)
269270

271+
_validate_nodata_arg(nodata)
272+
270273
# Up-front validation: catch bad compression names before they reach
271274
# any of the deeper write paths (streaming, GPU, VRT, COG) where the
272275
# error surfaces from _compression_tag with a less obvious traceback.
@@ -781,6 +784,7 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None,
781784
This enables streaming dask arrays to disk without materializing the
782785
full array in RAM.
783786
"""
787+
_validate_nodata_arg(nodata)
784788
# Validate compression_level against codec-specific range
785789
if compression_level is not None:
786790
level_range = _LEVEL_RANGES.get(compression.lower())

xrspatial/geotiff/_writers/gpu.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from .._runtime import GeoTIFFFallbackWarning
2828
from .._validation import (
2929
_validate_3d_writer_dims,
30+
_validate_nodata_arg,
3031
_validate_tile_size_arg,
3132
)
3233

@@ -259,6 +260,7 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray,
259260
# write_geotiff_gpu is always tiled, so validate tile_size here and
260261
# keep parity with the public to_geotiff entry point.
261262
_validate_tile_size_arg(tile_size)
263+
_validate_nodata_arg(nodata)
262264
if max_z_error < 0:
263265
raise ValueError(
264266
f"max_z_error must be >= 0, got {max_z_error}")
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
"""Refuse non-numeric ``nodata=`` / ``attrs['_FillValue']`` (#1973).
2+
3+
The writer compares the resolved nodata against pixel values via
4+
``np.isnan`` and casts it to the array dtype. A non-numeric value
5+
used to fall through ``_resolve_nodata_attr`` (returned verbatim) or
6+
the ``nodata=`` kwarg path and then crash inside NumPy with
7+
``ufunc 'isnan' not supported``. Both the entry point and the attr
8+
resolution path now refuse non-numeric values up front with a clear
9+
error.
10+
"""
11+
from __future__ import annotations
12+
13+
import importlib.util
14+
import io
15+
16+
import numpy as np
17+
import pytest
18+
import xarray as xr
19+
20+
from xrspatial.geotiff import to_geotiff
21+
from xrspatial.geotiff._attrs import _resolve_nodata_attr
22+
from xrspatial.geotiff._validation import _validate_nodata_arg
23+
24+
25+
def _gpu_available() -> bool:
26+
if importlib.util.find_spec("cupy") is None:
27+
return False
28+
try:
29+
import cupy
30+
return bool(cupy.cuda.is_available())
31+
except Exception:
32+
return False
33+
34+
35+
_HAS_GPU = _gpu_available()
36+
_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")
37+
38+
39+
def _nan_square():
40+
return xr.DataArray(
41+
np.full((4, 4), np.nan, dtype=np.float32),
42+
coords={'y': np.arange(4.0), 'x': np.arange(4.0)},
43+
dims=('y', 'x'),
44+
)
45+
46+
47+
@pytest.mark.parametrize("bad", ['missing', object(), [1, 2]])
48+
def test_validate_nodata_arg_rejects_non_numeric(bad):
49+
with pytest.raises(ValueError, match="nodata must be numeric"):
50+
_validate_nodata_arg(bad)
51+
52+
53+
@pytest.mark.parametrize("ok", [None, 0, -9999, 1.5, np.float32(-1), np.int64(0)])
54+
def test_validate_nodata_arg_accepts_numeric_and_none(ok):
55+
_validate_nodata_arg(ok)
56+
57+
58+
def test_resolve_nodata_attr_rejects_non_numeric_fillvalue():
59+
with pytest.raises(ValueError, match="_FillValue"):
60+
_resolve_nodata_attr({'_FillValue': 'missing'})
61+
62+
63+
def test_resolve_nodata_attr_rejects_non_numeric_nodata_attr():
64+
with pytest.raises(ValueError, match=r"attrs\['nodata'\]"):
65+
_resolve_nodata_attr({'nodata': 'missing'})
66+
67+
68+
def test_resolve_nodata_attr_skips_non_numeric_in_nodatavals():
69+
# nodatavals (rioxarray's per-band tuple) keeps its skip-on-non-numeric
70+
# behaviour: those values often come from arbitrary upstream pipelines
71+
# and a single bad entry should not block writing.
72+
assert _resolve_nodata_attr({'nodatavals': ('NaN ', -9999.0)}) == -9999.0
73+
74+
75+
def test_resolve_nodata_attr_still_accepts_numeric_fillvalue():
76+
assert _resolve_nodata_attr({'_FillValue': -9999}) == -9999
77+
78+
79+
def test_resolve_nodata_attr_returns_none_for_nan_fillvalue():
80+
assert _resolve_nodata_attr({'_FillValue': float('nan')}) is None
81+
82+
83+
def test_to_geotiff_rejects_non_numeric_nodata_kwarg():
84+
buf = io.BytesIO()
85+
with pytest.raises(ValueError, match="nodata must be numeric"):
86+
to_geotiff(_nan_square(), buf, nodata='missing')
87+
88+
89+
def test_to_geotiff_rejects_non_numeric_fillvalue_attr():
90+
da = _nan_square()
91+
da.attrs['_FillValue'] = 'missing'
92+
buf = io.BytesIO()
93+
with pytest.raises(ValueError, match="_FillValue"):
94+
to_geotiff(da, buf)
95+
96+
97+
def test_to_geotiff_vrt_path_rejects_non_numeric_nodata(tmp_path):
98+
vrt_path = str(tmp_path / "tmp_1973_vrt.vrt")
99+
with pytest.raises(ValueError, match="nodata must be numeric"):
100+
to_geotiff(_nan_square(), vrt_path, nodata='missing')
101+
102+
103+
def test_to_geotiff_accepts_numeric_nodata_kwarg():
104+
buf = io.BytesIO()
105+
to_geotiff(_nan_square(), buf, nodata=-9999)
106+
assert buf.getbuffer().nbytes > 0
107+
108+
109+
# ---------------------------------------------------------------------------
110+
# Bool rejection: ``nodata=True`` / ``nodata=False`` must raise TypeError at
111+
# all three writer entry points (eager, GPU, VRT). The eager path already
112+
# rejected bools for #1911 but the GPU/VRT validators previously routed bool
113+
# through ``float(True) == 1.0`` and silently coerced. The shared validator
114+
# now refuses bools so all three paths behave the same.
115+
# ---------------------------------------------------------------------------
116+
117+
118+
@pytest.mark.parametrize("bad", [True, False])
119+
def test_validate_nodata_arg_rejects_bool(bad):
120+
with pytest.raises(TypeError, match="nodata must be numeric"):
121+
_validate_nodata_arg(bad)
122+
123+
124+
def test_validate_nodata_arg_rejects_numpy_bool():
125+
with pytest.raises(TypeError, match="nodata must be numeric"):
126+
_validate_nodata_arg(np.bool_(True))
127+
128+
129+
def test_to_geotiff_eager_rejects_bool_nodata():
130+
buf = io.BytesIO()
131+
with pytest.raises(TypeError, match="nodata must be numeric"):
132+
to_geotiff(_nan_square(), buf, nodata=True)
133+
134+
135+
def test_to_geotiff_vrt_rejects_bool_nodata(tmp_path):
136+
vrt_path = str(tmp_path / "tmp_1973_bool_vrt.vrt")
137+
with pytest.raises(TypeError, match="nodata must be numeric"):
138+
to_geotiff(_nan_square(), vrt_path, nodata=True)
139+
140+
141+
@_gpu_only
142+
def test_write_geotiff_gpu_rejects_bool_nodata(tmp_path):
143+
import cupy
144+
145+
from xrspatial.geotiff import write_geotiff_gpu
146+
147+
da_cpu = _nan_square()
148+
da_gpu = da_cpu.copy(data=cupy.asarray(da_cpu.values))
149+
out = str(tmp_path / "tmp_1973_bool_gpu.tif")
150+
with pytest.raises(TypeError, match="nodata must be numeric"):
151+
write_geotiff_gpu(da_gpu, out, nodata=True)
152+
153+
154+
# ---------------------------------------------------------------------------
155+
# All-non-numeric ``attrs['nodatavals']``: warn but still return None and
156+
# fall through. A tuple where every entry is non-numeric is almost certainly
157+
# a user error rather than a legitimate "no sentinel" signal.
158+
# ---------------------------------------------------------------------------
159+
160+
161+
def test_resolve_nodata_attr_warns_when_nodatavals_all_non_numeric():
162+
with pytest.warns(UserWarning, match="nodatavals"):
163+
result = _resolve_nodata_attr({'nodatavals': ('foo', 'bar')})
164+
assert result is None
165+
166+
167+
def test_resolve_nodata_attr_no_warning_when_nodatavals_has_usable_entry():
168+
# First entry is non-numeric, second is a real sentinel. The loop
169+
# returns -9999.0 before reaching the warn site, so no warning fires.
170+
import warnings as _warnings
171+
with _warnings.catch_warnings():
172+
_warnings.simplefilter("error")
173+
assert _resolve_nodata_attr({'nodatavals': ('foo', -9999.0)}) == -9999.0
174+
175+
176+
def test_resolve_nodata_attr_no_warning_when_nodatavals_all_nan():
177+
# NaN entries are skipped (they signal "the float NaN is the sentinel",
178+
# which doesn't need a GDAL_NODATA tag) but they ARE numeric, so the
179+
# all-non-numeric warning must not fire for an all-NaN tuple.
180+
import warnings as _warnings
181+
with _warnings.catch_warnings():
182+
_warnings.simplefilter("error")
183+
assert _resolve_nodata_attr({'nodatavals': (float('nan'),)}) is None

0 commit comments

Comments
 (0)