diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 07649781..40e6608f 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -68,11 +68,15 @@ _GPU_DEPRECATED_SENTINEL, _MISSING_SOURCES_SENTINEL, _ON_GPU_FAILURE_SENTINEL, - _X_DIM_NAMES, - _Y_DIM_NAMES, _geotiff_strict_mode, _gpu_fallback_warning_message, ) +from ._validation import ( + _validate_3d_writer_dims, + _validate_chunks_arg, + _validate_dtype_cast, + _validate_tile_size_arg, +) from ._writer import write # All names below are part of the supported public API. ``plot_geotiff`` @@ -91,68 +95,6 @@ ] -def _validate_3d_writer_dims(dims) -> None: - """Reject ambiguous 3D writer inputs (issue #1812). - - The writer interprets a 3D DataArray as either ``(band, y, x)`` or - ``(y, x, band)``. ``data.dims[0] in _BAND_DIM_NAMES`` decides which - branch fires the ``moveaxis``. Anything else (e.g. ``('time', 'y', 'x')``) - used to fall through silently: the writer kept the leading axis as - the spatial ``y`` axis and the result was a TIFF with the leading - axis values laid out along ``y`` (silent data corruption -- on - read-back the array round-tripped with a swapped shape). - - Refuse the ambiguous case at the entry point. The message tells the - caller exactly how to fix the input (rename to one of - ``_BAND_DIM_NAMES`` or transpose to ``(y, x, band)``). - """ - if len(dims) != 3: - return - d0, d1, d2 = dims - band_layout = (d0 in _BAND_DIM_NAMES - and d1 in _Y_DIM_NAMES - and d2 in _X_DIM_NAMES) - yxb_layout = (d0 in _Y_DIM_NAMES - and d1 in _X_DIM_NAMES - and d2 in _BAND_DIM_NAMES) - if band_layout or yxb_layout: - return - # Bare (y, x, *) or (*, y, x) where the third dim is unnamed but - # spatial -- the writer's old behaviour treats the non-spatial axis - # as bands. Accept that only when the unknown dim is in the band - # position (last), which matches how raw numpy callers typically - # build a band-last array. - if d0 in _Y_DIM_NAMES and d1 in _X_DIM_NAMES: - return - raise ValueError( - f"3D writer input has ambiguous dims {dims!r}. Expected " - f"(band, y, x) or (y, x, band); accepted band-dim aliases are " - f"{_BAND_DIM_NAMES} and spatial aliases are y={_Y_DIM_NAMES} / " - f"x={_X_DIM_NAMES}. Rename the non-spatial dim to 'band' or " - f"transpose the array so spatial dims come first (e.g. " - f"``da.transpose('y', 'x', {dims[0]!r})``). The writer cannot " - f"infer which axis is the band axis from arbitrary dim names " - f"and would otherwise silently treat the leading axis as the " - f"spatial y axis (issue #1812)." - ) - - -def _validate_dtype_cast(source_dtype, target_dtype): - """Validate that casting source_dtype to target_dtype is allowed. - - Raises ValueError for float-to-int casts (lossy in a way users - often don't intend). All other casts are permitted -- the user - asked for them explicitly. - """ - src = np.dtype(source_dtype) - tgt = np.dtype(target_dtype) - if src.kind == 'f' and tgt.kind in ('u', 'i'): - raise ValueError( - f"Cannot cast float ({src}) to int ({tgt}). " - f"This loses fractional data and is usually unintentional. " - f"Cast explicitly after reading if you really want this.") - - def _read_geo_info(source, *, overview_level: int | None = None): """Read only the geographic metadata and image dimensions from a GeoTIFF. @@ -926,44 +868,6 @@ def _extract_rich_tags(attrs: dict) -> dict: } -def _validate_tile_size(tile_size) -> None: - """Validate ``tile_size`` for the tiled GeoTIFF writers. - - Shared by ``to_geotiff`` (when ``tiled=True``) and - ``write_geotiff_gpu`` (always tiled) so the accepted types, the - non-positive rejection, and the multiple-of-16 hint stay in lockstep. - The tiled writer computes the tile grid as - ``math.ceil(width / tile_size)``; ``tile_size=0`` hits - ``ZeroDivisionError`` deep inside the writer, and negative values - produce a nonsensical tile grid. The TIFF 6 spec also requires - ``TileWidth`` and ``TileLength`` to be positive multiples of 16 - for broad interoperability with libtiff / GDAL strict readers; a - value like 17 would otherwise round-trip through the in-repo - reader but be rejected elsewhere. - """ - if not isinstance(tile_size, (int, np.integer)) or isinstance( - tile_size, bool): - raise ValueError( - f"tile_size must be a positive int, got " - f"{tile_size!r} (type {type(tile_size).__name__}).") - if tile_size <= 0: - raise ValueError( - f"tile_size must be a positive int, got tile_size={tile_size}.") - if tile_size % 16 != 0: - lower = (int(tile_size) // 16) * 16 - upper = lower + 16 - # ``lower`` is 0 for tile_size < 16; suppress it from the hint - # because 0 is not a valid tile size on its own. - if lower <= 0: - hint = f"try tile_size={upper}" - else: - hint = f"try tile_size={lower} or tile_size={upper}" - raise ValueError( - f"tile_size must be a positive multiple of 16 (TIFF 6 " - f"spec requirement for TileWidth/TileLength), got " - f"tile_size={tile_size}; {hint}.") - - def to_geotiff(data: xr.DataArray | np.ndarray, path: str | BinaryIO, *, crs: int | str | None = None, @@ -1789,70 +1693,6 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, _write_vrt_fn(vrt_path, tile_paths, relative=True, nodata=nodata) -def _validate_chunks_arg(chunks, *, allow_none=False): - """Validate the ``chunks`` kwarg shared across the dask read entry points. - - Centralises the rejection rule that ``read_geotiff_dask`` already - runs so ``read_geotiff_gpu`` and ``read_vrt`` can share the same - error format. With ``allow_none=True`` a ``None`` value passes - through unchanged (used by entry points whose default is - ``chunks=None``, e.g. ``read_geotiff_gpu`` and ``read_vrt``). - With ``allow_none=False`` (default, matches ``read_geotiff_dask``) - a ``None`` is rejected with the same ``ValueError`` format as any - other non-int / non-tuple value, so callers see a clear - parameter-named error instead of a downstream ``TypeError`` from - the chunk-unpacking math. - Otherwise ``chunks`` must be a positive int or a 2-tuple of - positive ints. Booleans are rejected because ``True``/``False`` - are int subclasses that would otherwise sneak through the integer - check. Returns the coerced int when given an ``np.integer`` scalar - so downstream ``isinstance(chunks, int)`` checks stay accurate. - - Mirrors the chunks-validation #1752 added to ``read_geotiff_dask``; - extends it to the GPU read and VRT read entry points per #1776. - """ - if chunks is None: - if allow_none: - return chunks - raise ValueError( - f"chunks must be a positive int or (row, col) tuple of " - f"positive ints, got chunks=None.") - if (isinstance(chunks, (int, np.integer)) - and not isinstance(chunks, bool)): - if chunks <= 0: - raise ValueError( - f"chunks must be a positive int or (row, col) tuple of " - f"positive ints, got chunks={chunks}.") - return int(chunks) - if isinstance(chunks, tuple): - if len(chunks) != 2: - raise ValueError( - f"chunks tuple must have length 2 (row, col), got " - f"chunks={chunks!r} with length {len(chunks)}.") - for _v in chunks: - if (not isinstance(_v, (int, np.integer)) - or isinstance(_v, bool) - or _v <= 0): - raise ValueError( - f"chunks must be a positive int or (row, col) tuple " - f"of positive ints, got chunks={chunks!r}.") - return chunks - raise ValueError( - f"chunks must be a positive int or (row, col) tuple of " - f"positive ints, got chunks={chunks!r} " - f"(type {type(chunks).__name__}).") - - -def _validate_tile_size_arg(tile_size): - """Validate the ``tile_size`` kwarg for the tiled writer entry points. - - Wrapper kept for backwards internal compatibility; delegates to - ``_validate_tile_size`` so to_geotiff/write_geotiff_gpu share one - validation path (positive int + multiple-of-16 for tiled output). - """ - _validate_tile_size(tile_size) - - def read_geotiff_dask(source: str, *, dtype: str | np.dtype | None = None, chunks: int | tuple = 512, diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py new file mode 100644 index 00000000..bde6773c --- /dev/null +++ b/xrspatial/geotiff/_validation.py @@ -0,0 +1,181 @@ +"""Input validators shared by the geotiff entry points. + +Pure leaves over numpy dtypes and Python primitives. Called from +``to_geotiff``, ``read_geotiff_dask``, ``read_geotiff_gpu``, +``read_vrt``, and ``write_geotiff_gpu`` so the rejection rules +(non-positive chunks, lossy float-to-int casts, ambiguous 3D dim +layouts, tile-size multiples of 16, etc.) stay in lockstep across +every backend. + +Extracted in step 4 of issue #1813. +""" +from __future__ import annotations + +import numpy as np + +from ._coords import _BAND_DIM_NAMES +from ._runtime import _X_DIM_NAMES, _Y_DIM_NAMES + + +def _validate_3d_writer_dims(dims) -> None: + """Reject ambiguous 3D writer inputs (issue #1812). + + The writer interprets a 3D DataArray as either ``(band, y, x)`` or + ``(y, x, band)``. ``data.dims[0] in _BAND_DIM_NAMES`` decides which + branch fires the ``moveaxis``. Anything else (e.g. ``('time', 'y', 'x')``) + used to fall through silently: the writer kept the leading axis as + the spatial ``y`` axis and the result was a TIFF with the leading + axis values laid out along ``y`` (silent data corruption -- on + read-back the array round-tripped with a swapped shape). + + Refuse the ambiguous case at the entry point. The message tells the + caller exactly how to fix the input (rename to one of + ``_BAND_DIM_NAMES`` or transpose to ``(y, x, band)``). + """ + if len(dims) != 3: + return + d0, d1, d2 = dims + band_layout = (d0 in _BAND_DIM_NAMES + and d1 in _Y_DIM_NAMES + and d2 in _X_DIM_NAMES) + yxb_layout = (d0 in _Y_DIM_NAMES + and d1 in _X_DIM_NAMES + and d2 in _BAND_DIM_NAMES) + if band_layout or yxb_layout: + return + # Bare (y, x, *) or (*, y, x) where the third dim is unnamed but + # spatial -- the writer's old behaviour treats the non-spatial axis + # as bands. Accept that only when the unknown dim is in the band + # position (last), which matches how raw numpy callers typically + # build a band-last array. + if d0 in _Y_DIM_NAMES and d1 in _X_DIM_NAMES: + return + raise ValueError( + f"3D writer input has ambiguous dims {dims!r}. Expected " + f"(band, y, x) or (y, x, band); accepted band-dim aliases are " + f"{_BAND_DIM_NAMES} and spatial aliases are y={_Y_DIM_NAMES} / " + f"x={_X_DIM_NAMES}. Rename the non-spatial dim to 'band' or " + f"transpose the array so spatial dims come first (e.g. " + f"``da.transpose('y', 'x', {dims[0]!r})``). The writer cannot " + f"infer which axis is the band axis from arbitrary dim names " + f"and would otherwise silently treat the leading axis as the " + f"spatial y axis (issue #1812)." + ) + + +def _validate_dtype_cast(source_dtype, target_dtype): + """Validate that casting source_dtype to target_dtype is allowed. + + Raises ValueError for float-to-int casts (lossy in a way users + often don't intend). All other casts are permitted -- the user + asked for them explicitly. + """ + src = np.dtype(source_dtype) + tgt = np.dtype(target_dtype) + if src.kind == 'f' and tgt.kind in ('u', 'i'): + raise ValueError( + f"Cannot cast float ({src}) to int ({tgt}). " + f"This loses fractional data and is usually unintentional. " + f"Cast explicitly after reading if you really want this.") + + +def _validate_tile_size(tile_size) -> None: + """Validate ``tile_size`` for the tiled GeoTIFF writers. + + Shared by ``to_geotiff`` (when ``tiled=True``) and + ``write_geotiff_gpu`` (always tiled) so the accepted types, the + non-positive rejection, and the multiple-of-16 hint stay in lockstep. + The tiled writer computes the tile grid as + ``math.ceil(width / tile_size)``; ``tile_size=0`` hits + ``ZeroDivisionError`` deep inside the writer, and negative values + produce a nonsensical tile grid. The TIFF 6 spec also requires + ``TileWidth`` and ``TileLength`` to be positive multiples of 16 + for broad interoperability with libtiff / GDAL strict readers; a + value like 17 would otherwise round-trip through the in-repo + reader but be rejected elsewhere. + """ + if not isinstance(tile_size, (int, np.integer)) or isinstance( + tile_size, bool): + raise ValueError( + f"tile_size must be a positive int, got " + f"{tile_size!r} (type {type(tile_size).__name__}).") + if tile_size <= 0: + raise ValueError( + f"tile_size must be a positive int, got tile_size={tile_size}.") + if tile_size % 16 != 0: + lower = (int(tile_size) // 16) * 16 + upper = lower + 16 + # ``lower`` is 0 for tile_size < 16; suppress it from the hint + # because 0 is not a valid tile size on its own. + if lower <= 0: + hint = f"try tile_size={upper}" + else: + hint = f"try tile_size={lower} or tile_size={upper}" + raise ValueError( + f"tile_size must be a positive multiple of 16 (TIFF 6 " + f"spec requirement for TileWidth/TileLength), got " + f"tile_size={tile_size}; {hint}.") + + +def _validate_chunks_arg(chunks, *, allow_none=False): + """Validate the ``chunks`` kwarg shared across the dask read entry points. + + Centralises the rejection rule that ``read_geotiff_dask`` already + runs so ``read_geotiff_gpu`` and ``read_vrt`` can share the same + error format. With ``allow_none=True`` a ``None`` value passes + through unchanged (used by entry points whose default is + ``chunks=None``, e.g. ``read_geotiff_gpu`` and ``read_vrt``). + With ``allow_none=False`` (default, matches ``read_geotiff_dask``) + a ``None`` is rejected with the same ``ValueError`` format as any + other non-int / non-tuple value, so callers see a clear + parameter-named error instead of a downstream ``TypeError`` from + the chunk-unpacking math. + Otherwise ``chunks`` must be a positive int or a 2-tuple of + positive ints. Booleans are rejected because ``True``/``False`` + are int subclasses that would otherwise sneak through the integer + check. Returns the coerced int when given an ``np.integer`` scalar + so downstream ``isinstance(chunks, int)`` checks stay accurate. + + Mirrors the chunks-validation #1752 added to ``read_geotiff_dask``; + extends it to the GPU read and VRT read entry points per #1776. + """ + if chunks is None: + if allow_none: + return chunks + raise ValueError( + f"chunks must be a positive int or (row, col) tuple of " + f"positive ints, got chunks=None.") + if (isinstance(chunks, (int, np.integer)) + and not isinstance(chunks, bool)): + if chunks <= 0: + raise ValueError( + f"chunks must be a positive int or (row, col) tuple of " + f"positive ints, got chunks={chunks}.") + return int(chunks) + if isinstance(chunks, tuple): + if len(chunks) != 2: + raise ValueError( + f"chunks tuple must have length 2 (row, col), got " + f"chunks={chunks!r} with length {len(chunks)}.") + for _v in chunks: + if (not isinstance(_v, (int, np.integer)) + or isinstance(_v, bool) + or _v <= 0): + raise ValueError( + f"chunks must be a positive int or (row, col) tuple " + f"of positive ints, got chunks={chunks!r}.") + return chunks + raise ValueError( + f"chunks must be a positive int or (row, col) tuple of " + f"positive ints, got chunks={chunks!r} " + f"(type {type(chunks).__name__}).") + + +def _validate_tile_size_arg(tile_size): + """Validate the ``tile_size`` kwarg for the tiled writer entry points. + + Wrapper kept for backwards internal compatibility; delegates to + ``_validate_tile_size`` so to_geotiff/write_geotiff_gpu share one + validation path (positive int + multiple-of-16 for tiled output). + """ + _validate_tile_size(tile_size) diff --git a/xrspatial/geotiff/tests/test_runtime_sentinels_identity_1880.py b/xrspatial/geotiff/tests/test_runtime_sentinels_identity_1880.py index b1af6796..6491502e 100644 --- a/xrspatial/geotiff/tests/test_runtime_sentinels_identity_1880.py +++ b/xrspatial/geotiff/tests/test_runtime_sentinels_identity_1880.py @@ -50,11 +50,6 @@ def test_fallback_warning_class_is_singleton(): assert geotiff_pkg.GeoTIFFFallbackWarning is _runtime.GeoTIFFFallbackWarning -def test_dim_name_tuples_are_singleton(): - assert geotiff_pkg._Y_DIM_NAMES is _runtime._Y_DIM_NAMES - assert geotiff_pkg._X_DIM_NAMES is _runtime._X_DIM_NAMES - - def test_strict_mode_helper_is_singleton(): assert geotiff_pkg._geotiff_strict_mode is _runtime._geotiff_strict_mode