Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 6 additions & 166 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,15 @@
_GPU_DEPRECATED_SENTINEL,
_MISSING_SOURCES_SENTINEL,
_ON_GPU_FAILURE_SENTINEL,
_X_DIM_NAMES,
_Y_DIM_NAMES,
_geotiff_strict_mode,
_gpu_fallback_warning_message,
)
from ._validation import (
_validate_3d_writer_dims,
_validate_chunks_arg,
_validate_dtype_cast,
_validate_tile_size_arg,
)
from ._writer import write

# All names below are part of the supported public API. ``plot_geotiff``
Expand All @@ -91,68 +95,6 @@
]


def _validate_3d_writer_dims(dims) -> None:
"""Reject ambiguous 3D writer inputs (issue #1812).

The writer interprets a 3D DataArray as either ``(band, y, x)`` or
``(y, x, band)``. ``data.dims[0] in _BAND_DIM_NAMES`` decides which
branch fires the ``moveaxis``. Anything else (e.g. ``('time', 'y', 'x')``)
used to fall through silently: the writer kept the leading axis as
the spatial ``y`` axis and the result was a TIFF with the leading
axis values laid out along ``y`` (silent data corruption -- on
read-back the array round-tripped with a swapped shape).

Refuse the ambiguous case at the entry point. The message tells the
caller exactly how to fix the input (rename to one of
``_BAND_DIM_NAMES`` or transpose to ``(y, x, band)``).
"""
if len(dims) != 3:
return
d0, d1, d2 = dims
band_layout = (d0 in _BAND_DIM_NAMES
and d1 in _Y_DIM_NAMES
and d2 in _X_DIM_NAMES)
yxb_layout = (d0 in _Y_DIM_NAMES
and d1 in _X_DIM_NAMES
and d2 in _BAND_DIM_NAMES)
if band_layout or yxb_layout:
return
# Bare (y, x, *) or (*, y, x) where the third dim is unnamed but
# spatial -- the writer's old behaviour treats the non-spatial axis
# as bands. Accept that only when the unknown dim is in the band
# position (last), which matches how raw numpy callers typically
# build a band-last array.
if d0 in _Y_DIM_NAMES and d1 in _X_DIM_NAMES:
return
raise ValueError(
f"3D writer input has ambiguous dims {dims!r}. Expected "
f"(band, y, x) or (y, x, band); accepted band-dim aliases are "
f"{_BAND_DIM_NAMES} and spatial aliases are y={_Y_DIM_NAMES} / "
f"x={_X_DIM_NAMES}. Rename the non-spatial dim to 'band' or "
f"transpose the array so spatial dims come first (e.g. "
f"``da.transpose('y', 'x', {dims[0]!r})``). The writer cannot "
f"infer which axis is the band axis from arbitrary dim names "
f"and would otherwise silently treat the leading axis as the "
f"spatial y axis (issue #1812)."
)


def _validate_dtype_cast(source_dtype, target_dtype):
"""Validate that casting source_dtype to target_dtype is allowed.

Raises ValueError for float-to-int casts (lossy in a way users
often don't intend). All other casts are permitted -- the user
asked for them explicitly.
"""
src = np.dtype(source_dtype)
tgt = np.dtype(target_dtype)
if src.kind == 'f' and tgt.kind in ('u', 'i'):
raise ValueError(
f"Cannot cast float ({src}) to int ({tgt}). "
f"This loses fractional data and is usually unintentional. "
f"Cast explicitly after reading if you really want this.")


def _read_geo_info(source, *, overview_level: int | None = None):
"""Read only the geographic metadata and image dimensions from a GeoTIFF.

Expand Down Expand Up @@ -926,44 +868,6 @@ def _extract_rich_tags(attrs: dict) -> dict:
}


def _validate_tile_size(tile_size) -> None:
"""Validate ``tile_size`` for the tiled GeoTIFF writers.

Shared by ``to_geotiff`` (when ``tiled=True``) and
``write_geotiff_gpu`` (always tiled) so the accepted types, the
non-positive rejection, and the multiple-of-16 hint stay in lockstep.
The tiled writer computes the tile grid as
``math.ceil(width / tile_size)``; ``tile_size=0`` hits
``ZeroDivisionError`` deep inside the writer, and negative values
produce a nonsensical tile grid. The TIFF 6 spec also requires
``TileWidth`` and ``TileLength`` to be positive multiples of 16
for broad interoperability with libtiff / GDAL strict readers; a
value like 17 would otherwise round-trip through the in-repo
reader but be rejected elsewhere.
"""
if not isinstance(tile_size, (int, np.integer)) or isinstance(
tile_size, bool):
raise ValueError(
f"tile_size must be a positive int, got "
f"{tile_size!r} (type {type(tile_size).__name__}).")
if tile_size <= 0:
raise ValueError(
f"tile_size must be a positive int, got tile_size={tile_size}.")
if tile_size % 16 != 0:
lower = (int(tile_size) // 16) * 16
upper = lower + 16
# ``lower`` is 0 for tile_size < 16; suppress it from the hint
# because 0 is not a valid tile size on its own.
if lower <= 0:
hint = f"try tile_size={upper}"
else:
hint = f"try tile_size={lower} or tile_size={upper}"
raise ValueError(
f"tile_size must be a positive multiple of 16 (TIFF 6 "
f"spec requirement for TileWidth/TileLength), got "
f"tile_size={tile_size}; {hint}.")


def to_geotiff(data: xr.DataArray | np.ndarray,
path: str | BinaryIO, *,
crs: int | str | None = None,
Expand Down Expand Up @@ -1789,70 +1693,6 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None,
_write_vrt_fn(vrt_path, tile_paths, relative=True, nodata=nodata)


def _validate_chunks_arg(chunks, *, allow_none=False):
"""Validate the ``chunks`` kwarg shared across the dask read entry points.

Centralises the rejection rule that ``read_geotiff_dask`` already
runs so ``read_geotiff_gpu`` and ``read_vrt`` can share the same
error format. With ``allow_none=True`` a ``None`` value passes
through unchanged (used by entry points whose default is
``chunks=None``, e.g. ``read_geotiff_gpu`` and ``read_vrt``).
With ``allow_none=False`` (default, matches ``read_geotiff_dask``)
a ``None`` is rejected with the same ``ValueError`` format as any
other non-int / non-tuple value, so callers see a clear
parameter-named error instead of a downstream ``TypeError`` from
the chunk-unpacking math.
Otherwise ``chunks`` must be a positive int or a 2-tuple of
positive ints. Booleans are rejected because ``True``/``False``
are int subclasses that would otherwise sneak through the integer
check. Returns the coerced int when given an ``np.integer`` scalar
so downstream ``isinstance(chunks, int)`` checks stay accurate.

Mirrors the chunks-validation #1752 added to ``read_geotiff_dask``;
extends it to the GPU read and VRT read entry points per #1776.
"""
if chunks is None:
if allow_none:
return chunks
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks=None.")
if (isinstance(chunks, (int, np.integer))
and not isinstance(chunks, bool)):
if chunks <= 0:
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks={chunks}.")
return int(chunks)
if isinstance(chunks, tuple):
if len(chunks) != 2:
raise ValueError(
f"chunks tuple must have length 2 (row, col), got "
f"chunks={chunks!r} with length {len(chunks)}.")
for _v in chunks:
if (not isinstance(_v, (int, np.integer))
or isinstance(_v, bool)
or _v <= 0):
raise ValueError(
f"chunks must be a positive int or (row, col) tuple "
f"of positive ints, got chunks={chunks!r}.")
return chunks
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks={chunks!r} "
f"(type {type(chunks).__name__}).")


def _validate_tile_size_arg(tile_size):
"""Validate the ``tile_size`` kwarg for the tiled writer entry points.

Wrapper kept for backwards internal compatibility; delegates to
``_validate_tile_size`` so to_geotiff/write_geotiff_gpu share one
validation path (positive int + multiple-of-16 for tiled output).
"""
_validate_tile_size(tile_size)


def read_geotiff_dask(source: str, *,
dtype: str | np.dtype | None = None,
chunks: int | tuple = 512,
Expand Down
181 changes: 181 additions & 0 deletions xrspatial/geotiff/_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""Input validators shared by the geotiff entry points.

Pure leaves over numpy dtypes and Python primitives. Called from
``to_geotiff``, ``read_geotiff_dask``, ``read_geotiff_gpu``,
``read_vrt``, and ``write_geotiff_gpu`` so the rejection rules
(non-positive chunks, lossy float-to-int casts, ambiguous 3D dim
layouts, tile-size multiples of 16, etc.) stay in lockstep across
every backend.

Extracted in step 4 of issue #1813.
"""
from __future__ import annotations

import numpy as np

from ._coords import _BAND_DIM_NAMES
from ._runtime import _X_DIM_NAMES, _Y_DIM_NAMES


def _validate_3d_writer_dims(dims) -> None:
"""Reject ambiguous 3D writer inputs (issue #1812).

The writer interprets a 3D DataArray as either ``(band, y, x)`` or
``(y, x, band)``. ``data.dims[0] in _BAND_DIM_NAMES`` decides which
branch fires the ``moveaxis``. Anything else (e.g. ``('time', 'y', 'x')``)
used to fall through silently: the writer kept the leading axis as
the spatial ``y`` axis and the result was a TIFF with the leading
axis values laid out along ``y`` (silent data corruption -- on
read-back the array round-tripped with a swapped shape).

Refuse the ambiguous case at the entry point. The message tells the
caller exactly how to fix the input (rename to one of
``_BAND_DIM_NAMES`` or transpose to ``(y, x, band)``).
"""
if len(dims) != 3:
return
d0, d1, d2 = dims
band_layout = (d0 in _BAND_DIM_NAMES
and d1 in _Y_DIM_NAMES
and d2 in _X_DIM_NAMES)
yxb_layout = (d0 in _Y_DIM_NAMES
and d1 in _X_DIM_NAMES
and d2 in _BAND_DIM_NAMES)
if band_layout or yxb_layout:
return
# Bare (y, x, *) or (*, y, x) where the third dim is unnamed but
# spatial -- the writer's old behaviour treats the non-spatial axis
# as bands. Accept that only when the unknown dim is in the band
# position (last), which matches how raw numpy callers typically
# build a band-last array.
if d0 in _Y_DIM_NAMES and d1 in _X_DIM_NAMES:
return
raise ValueError(
f"3D writer input has ambiguous dims {dims!r}. Expected "
f"(band, y, x) or (y, x, band); accepted band-dim aliases are "
f"{_BAND_DIM_NAMES} and spatial aliases are y={_Y_DIM_NAMES} / "
f"x={_X_DIM_NAMES}. Rename the non-spatial dim to 'band' or "
f"transpose the array so spatial dims come first (e.g. "
f"``da.transpose('y', 'x', {dims[0]!r})``). The writer cannot "
f"infer which axis is the band axis from arbitrary dim names "
f"and would otherwise silently treat the leading axis as the "
f"spatial y axis (issue #1812)."
)


def _validate_dtype_cast(source_dtype, target_dtype):
"""Validate that casting source_dtype to target_dtype is allowed.

Raises ValueError for float-to-int casts (lossy in a way users
often don't intend). All other casts are permitted -- the user
asked for them explicitly.
"""
src = np.dtype(source_dtype)
tgt = np.dtype(target_dtype)
if src.kind == 'f' and tgt.kind in ('u', 'i'):
raise ValueError(
f"Cannot cast float ({src}) to int ({tgt}). "
f"This loses fractional data and is usually unintentional. "
f"Cast explicitly after reading if you really want this.")


def _validate_tile_size(tile_size) -> None:
"""Validate ``tile_size`` for the tiled GeoTIFF writers.

Shared by ``to_geotiff`` (when ``tiled=True``) and
``write_geotiff_gpu`` (always tiled) so the accepted types, the
non-positive rejection, and the multiple-of-16 hint stay in lockstep.
The tiled writer computes the tile grid as
``math.ceil(width / tile_size)``; ``tile_size=0`` hits
``ZeroDivisionError`` deep inside the writer, and negative values
produce a nonsensical tile grid. The TIFF 6 spec also requires
``TileWidth`` and ``TileLength`` to be positive multiples of 16
for broad interoperability with libtiff / GDAL strict readers; a
value like 17 would otherwise round-trip through the in-repo
reader but be rejected elsewhere.
"""
if not isinstance(tile_size, (int, np.integer)) or isinstance(
tile_size, bool):
raise ValueError(
f"tile_size must be a positive int, got "
f"{tile_size!r} (type {type(tile_size).__name__}).")
if tile_size <= 0:
raise ValueError(
f"tile_size must be a positive int, got tile_size={tile_size}.")
if tile_size % 16 != 0:
lower = (int(tile_size) // 16) * 16
upper = lower + 16
# ``lower`` is 0 for tile_size < 16; suppress it from the hint
# because 0 is not a valid tile size on its own.
if lower <= 0:
hint = f"try tile_size={upper}"
else:
hint = f"try tile_size={lower} or tile_size={upper}"
raise ValueError(
f"tile_size must be a positive multiple of 16 (TIFF 6 "
f"spec requirement for TileWidth/TileLength), got "
f"tile_size={tile_size}; {hint}.")


def _validate_chunks_arg(chunks, *, allow_none=False):
"""Validate the ``chunks`` kwarg shared across the dask read entry points.

Centralises the rejection rule that ``read_geotiff_dask`` already
runs so ``read_geotiff_gpu`` and ``read_vrt`` can share the same
error format. With ``allow_none=True`` a ``None`` value passes
through unchanged (used by entry points whose default is
``chunks=None``, e.g. ``read_geotiff_gpu`` and ``read_vrt``).
With ``allow_none=False`` (default, matches ``read_geotiff_dask``)
a ``None`` is rejected with the same ``ValueError`` format as any
other non-int / non-tuple value, so callers see a clear
parameter-named error instead of a downstream ``TypeError`` from
the chunk-unpacking math.
Otherwise ``chunks`` must be a positive int or a 2-tuple of
positive ints. Booleans are rejected because ``True``/``False``
are int subclasses that would otherwise sneak through the integer
check. Returns the coerced int when given an ``np.integer`` scalar
so downstream ``isinstance(chunks, int)`` checks stay accurate.

Mirrors the chunks-validation #1752 added to ``read_geotiff_dask``;
extends it to the GPU read and VRT read entry points per #1776.
"""
if chunks is None:
if allow_none:
return chunks
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks=None.")
if (isinstance(chunks, (int, np.integer))
and not isinstance(chunks, bool)):
if chunks <= 0:
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks={chunks}.")
return int(chunks)
if isinstance(chunks, tuple):
if len(chunks) != 2:
raise ValueError(
f"chunks tuple must have length 2 (row, col), got "
f"chunks={chunks!r} with length {len(chunks)}.")
for _v in chunks:
if (not isinstance(_v, (int, np.integer))
or isinstance(_v, bool)
or _v <= 0):
raise ValueError(
f"chunks must be a positive int or (row, col) tuple "
f"of positive ints, got chunks={chunks!r}.")
return chunks
raise ValueError(
f"chunks must be a positive int or (row, col) tuple of "
f"positive ints, got chunks={chunks!r} "
f"(type {type(chunks).__name__}).")


def _validate_tile_size_arg(tile_size):
"""Validate the ``tile_size`` kwarg for the tiled writer entry points.

Wrapper kept for backwards internal compatibility; delegates to
``_validate_tile_size`` so to_geotiff/write_geotiff_gpu share one
validation path (positive int + multiple-of-16 for tiled output).
"""
_validate_tile_size(tile_size)
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@ def test_fallback_warning_class_is_singleton():
assert geotiff_pkg.GeoTIFFFallbackWarning is _runtime.GeoTIFFFallbackWarning


def test_dim_name_tuples_are_singleton():
assert geotiff_pkg._Y_DIM_NAMES is _runtime._Y_DIM_NAMES
assert geotiff_pkg._X_DIM_NAMES is _runtime._X_DIM_NAMES


def test_strict_mode_helper_is_singleton():
assert geotiff_pkg._geotiff_strict_mode is _runtime._geotiff_strict_mode

Expand Down
Loading