Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Unreleased

#### Bug fixes and improvements
- Resolve the GeoTIFF writer's `GeographicTypeGeoKey` / `ProjectedCSTypeGeoKey` decision via pyproj instead of an EPSG number range. The legacy heuristic (4326 + 4000-4999 -> geographic, else projected) silently mis-tagged geographic CRSes registered outside the 4000-4999 block (NAD83(2011) = 6318, GDA2020 = 7844, WGS 84 (G2139) = 9057, etc.) as projected and projected codes inside the block (4087 / 4088 / 4499) as geographic, corrupting the CRS at write time. The writer now calls `pyproj.CRS.from_epsg(...).is_geographic`. When pyproj can't classify a code (uninstalled, or installed but the local PROJ database lacks the entry), the writer raises the new `UnknownCRSModelTypeError` rather than guessing -- a small vetted allowlist (4326, 4269, 4267, 4258, 4283, 4322, 4230, 4019, 4047) is still honoured for the pyproj-missing case. `pyproj` is now listed under the `geotiff` extra. (#2277)
- Shut down the per-tile compression `ThreadPoolExecutor` on every exit path of the streaming tiled-write code in `to_geotiff`. The old code only called `shutdown(wait=True)` after the tile-row loop completed, so any mid-stream raise (compression failure, dask compute failure, file write failure) bypassed shutdown and leaked worker threads. The loop now runs inside `try/finally` and the finally calls `shutdown(wait=True, cancel_futures=True)` so queued tiles get dropped on the error path instead of blocking the unwind. The pool's workers carry an `xrspatial-geotiff-tile-compress` `thread_name_prefix` so leak-detection tests can tell them apart from dask's own offload/scheduler pools. (#2276)
- Remove read-side emission of the 13 deprecated GeoTIFF attrs (`crs_name`, `geog_citation`, `datum_code`, `angular_units`, `semi_major_axis`, `inv_flattening`, `linear_units`, `projection_code`, `vertical_crs`, `vertical_citation`, `vertical_units`, `colormap_rgba`, `cmap`) and bump `attrs['_xrspatial_geotiff_contract']` from 1 to 2. Downstream code that read these via `attrs[key]` now sees `KeyError`; migrate to `attrs.get(key)` or derive the value from `attrs['crs']` / `attrs['crs_wkt']` with pyproj. The `.xrs.plot()` accessor still surfaces palette colormaps by building a `ListedColormap` from the canonical `attrs['colormap']`. (#2016)
- Accept numpy integer scalars as the `crs=` argument to `to_geotiff` / `write_geotiff_gpu`. The validator already allowed `numbers.Integral`, but the writers gated EPSG assignment on `isinstance(crs, int)`, so `np.int32` / `np.int64` / `np.uint16` values passed validation then silently fell through with no EPSG written. (#2082)
Expand Down
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ reproject =
pyproj
geotiff =
deflate
# Required by the GeoTIFF writer to classify EPSG codes as
# geographic vs projected. Without pyproj the writer can only
# handle a tiny hard-coded allowlist of EPSGs and raises
# UnknownCRSModelTypeError on anything else. See issue #2277.
pyproj
dask =
dask[array]
dask-geopandas
Expand Down
2 changes: 2 additions & 0 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
MixedBandMetadataError,
NonUniformCoordsError,
RotatedTransformError,
UnknownCRSModelTypeError,
UnparseableCRSError,
)
from ._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT
Expand Down Expand Up @@ -140,6 +141,7 @@
'NonUniformCoordsError',
'RotatedTransformError',
'SUPPORTED_FEATURES',
'UnknownCRSModelTypeError',
'UnparseableCRSError',
'UnsafeURLError',
'open_geotiff',
Expand Down
16 changes: 16 additions & 0 deletions xrspatial/geotiff/_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,21 @@ class ConflictingNodataError(GeoTIFFAmbiguousMetadataError):
"""


class UnknownCRSModelTypeError(GeoTIFFAmbiguousMetadataError):
"""Can't classify an EPSG as geographic or projected on write (#2277).

Raised by the GeoTIFF writer when the caller supplies an EPSG code
that pyproj cannot resolve, or when pyproj isn't installed and the
code falls outside the hard-coded geographic fallback set
(EPSG 4326 plus the 4000-4999 block). The legacy heuristic at the
same site guessed any code outside that window was projected, which
silently mis-tagged geographic codes like 6318 (NAD83(2011)),
7844 (GDA2020), and 9057 (WGS 84 (G2139)) as
``ProjectedCSTypeGeoKey``. Silent CRS corruption is worse than an
explicit error.
"""


__all__ = [
"GeoTIFFAmbiguousMetadataError",
"InvalidCRSCodeError",
Expand All @@ -112,4 +127,5 @@ class ConflictingNodataError(GeoTIFFAmbiguousMetadataError):
"MixedBandMetadataError",
"ConflictingCRSError",
"ConflictingNodataError",
"UnknownCRSModelTypeError",
]
101 changes: 95 additions & 6 deletions xrspatial/geotiff/_geotags.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
TAG_GEO_KEY_DIRECTORY, TAG_GEO_DOUBLE_PARAMS, TAG_GEO_ASCII_PARAMS,
)
from ._dtypes import resolve_bits_per_sample
from ._errors import RotatedTransformError
from ._errors import RotatedTransformError, UnknownCRSModelTypeError

# ImageDescription tag (270). Captured for round-trip but not managed
# by the writer -- it flows through extra_tags pass-through.
Expand Down Expand Up @@ -1317,6 +1317,92 @@ def _model_type_from_wkt(wkt: str) -> int:
return MODEL_TYPE_PROJECTED


# Vetted allowlist of EPSG codes known to be geographic. Consulted in
# two situations: when pyproj is unavailable at all, and when pyproj is
# installed but its database can't resolve the code (e.g. out-of-date
# PROJ db). Anything outside this set falls through to
# UnknownCRSModelTypeError -- silent CRS corruption is worse than an
# explicit error.
#
# The set is kept intentionally small. The original range heuristic
# (4000-4999) was wrong in both directions: it missed geographic codes
# outside the window (NAD83(2011) = 6318, GDA2020 = 7844, etc., the bug
# this PR fixes), AND it also covered projected codes inside the window
# (4087 / 4088 World Equidistant Cylindrical, 4499 CGCS2000 / GK
# zone 21, etc.). Expanding the set without pyproj's authoritative
# answer just trades one mis-tag for another.
#
# Currently: the few datums callers regularly write without pyproj.
# Extend only after verifying ``CRS.from_epsg(code).is_geographic`` is
# True. See issue #2277.
_KNOWN_GEOGRAPHIC_EPSG_FALLBACK = frozenset({
4326, # WGS 84
4269, # NAD83
4267, # NAD27
4258, # ETRS89
4283, # GDA94
4322, # WGS 72
4230, # ED50
4019, # Unknown datum based on GRS 1980 ellipsoid
4047, # Unspecified datum based on GRS 1980 Authalic Sphere
})


def _model_type_from_epsg(crs_epsg: int) -> int:
"""Return the GeoTIFF ModelType (geographic vs projected) for an EPSG.

Prefers ``pyproj.CRS.from_epsg(crs_epsg).is_geographic`` so the
decision matches the actual EPSG registry. Falls back to a small
vetted allowlist of common geographic datums
(:data:`_KNOWN_GEOGRAPHIC_EPSG_FALLBACK`) when pyproj isn't
installed *or* when pyproj is installed but fails to resolve the
code. Anything outside that allowlist raises
:class:`UnknownCRSModelTypeError` rather than guessing -- the legacy
range heuristic at this site silently mis-tagged geographic codes
like 6318 (NAD83(2011)) and 7844 (GDA2020) as projected.

See issue #2277.
"""
try:
from pyproj import CRS
except ImportError:
CRS = None # type: ignore[assignment]

if CRS is not None:
try:
crs = CRS.from_epsg(crs_epsg)
except Exception as e:
# pyproj is installed but the code is unknown or malformed.
# Fall back to the hard-coded allowlist before giving up so
# an offline pyproj-database mismatch doesn't break a code
# that the legacy heuristic handled correctly.
if crs_epsg in _KNOWN_GEOGRAPHIC_EPSG_FALLBACK:
return MODEL_TYPE_GEOGRAPHIC
raise UnknownCRSModelTypeError(
f"Cannot determine GeoTIFF model type for EPSG:{crs_epsg}: "
f"pyproj.CRS.from_epsg failed "
f"({type(e).__name__}: {e}). Refusing to guess; passing a "
"known EPSG or installing a current pyproj database will "
"resolve this."
) from e
if crs.is_geographic:
return MODEL_TYPE_GEOGRAPHIC
return MODEL_TYPE_PROJECTED

# pyproj unavailable. Use the tight fallback set; raise otherwise.
if crs_epsg in _KNOWN_GEOGRAPHIC_EPSG_FALLBACK:
return MODEL_TYPE_GEOGRAPHIC
raise UnknownCRSModelTypeError(
f"Cannot determine GeoTIFF model type for EPSG:{crs_epsg} without "
"pyproj. Install pyproj so the writer can consult "
"CRS.from_epsg(...).is_geographic, or pass an EPSG in the "
"hard-coded geographic fallback set "
f"({sorted(_KNOWN_GEOGRAPHIC_EPSG_FALLBACK)}). Refusing to "
"guess: the legacy range heuristic silently mis-tagged codes "
"like 6318 and 7844 as projected. See issue #2277."
)


def build_geo_tags(transform: GeoTransform, crs_epsg: int | None = None,
nodata=None,
raster_type: int = RASTER_PIXEL_IS_AREA,
Expand Down Expand Up @@ -1402,11 +1488,14 @@ def build_geo_tags(transform: GeoTransform, crs_epsg: int | None = None,

# ModelType
if crs_epsg is not None:
# Guess model type from EPSG (simple heuristic)
if crs_epsg == 4326 or (crs_epsg >= 4000 and crs_epsg < 5000):
model_type = MODEL_TYPE_GEOGRAPHIC
else:
model_type = MODEL_TYPE_PROJECTED
# Resolve the GeoTIFF ModelType via pyproj when available; raise
# on unknown codes when pyproj can't classify them rather than
# guessing. See issue #2277 -- the historic EPSG-range heuristic
# silently mis-tagged geographic codes outside 4000-4999 (e.g.
# 6318, 7844, 9057) as projected AND projected codes inside
# 4000-4999 (e.g. 4087, 4088, 4499) as geographic, corrupting
# the CRS at write time.
model_type = _model_type_from_epsg(crs_epsg)
key_entries.append((GEOKEY_MODEL_TYPE, 0, 1, model_type))
num_keys += 1
elif crs_wkt is not None:
Expand Down
1 change: 1 addition & 0 deletions xrspatial/geotiff/tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2780,6 +2780,7 @@ def test_all_lists_supported_functions(self):
'MixedBandMetadataError',
'NonUniformCoordsError',
'RotatedTransformError',
'UnknownCRSModelTypeError',
'UnparseableCRSError',
'GeoTIFFFallbackWarning',
'UnsafeURLError',
Expand Down
141 changes: 141 additions & 0 deletions xrspatial/geotiff/tests/test_geotags.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,147 @@ def test_projected_crs_geokey(self):
assert 3072 in geokeys # GEOKEY_PROJECTED_CS_TYPE


class TestModelTypeFromEPSG:
"""Regression coverage for issue #2277.

The legacy heuristic at build_geo_tags decided geographic-vs-projected
from the EPSG number range (4326 plus 4000-4999). That silently
mis-tagged geographic CRSes registered outside the 4000-4999 block
(NAD83(2011) = 6318, GDA2020 = 7844, WGS 84 (G2139) = 9057, etc.) as
projected, AND mis-tagged projected codes inside the block (4087 /
4088 / 4499 etc.) as geographic. Both directions corrupt the CRS at
write time.

The fix consults pyproj when available; when pyproj can't classify
a code (uninstalled, or installed but DB lookup fails), anything
outside a small vetted geographic allowlist raises rather than
guessing.
"""

@pytest.mark.parametrize("epsg", [4326, 4269, 4267, 4258, 4283])
def test_geographic_allowlist(self, epsg):
# Codes the hard-coded fallback set covers explicitly.
gt = GeoTransform(-120.0, 45.0, 0.001, -0.001)
tags = build_geo_tags(gt, crs_epsg=epsg)
geokeys = tags[TAG_GEO_KEY_DIRECTORY]
assert GEOKEY_GEOGRAPHIC_TYPE in geokeys
assert GEOKEY_PROJECTED_CS_TYPE not in geokeys

@pytest.mark.parametrize("epsg", [6318, 7844, 9057, 8252])
def test_geographic_outside_legacy_range(self, epsg):
# The bug: pre-fix, these wrote ProjectedCSTypeGeoKey.
# pyproj.CRS.from_epsg(...).is_geographic is True for all four.
pytest.importorskip("pyproj")
gt = GeoTransform(-120.0, 45.0, 0.001, -0.001)
tags = build_geo_tags(gt, crs_epsg=epsg)
geokeys = tags[TAG_GEO_KEY_DIRECTORY]
assert GEOKEY_GEOGRAPHIC_TYPE in geokeys, (
f"EPSG:{epsg} is geographic per pyproj but was written as "
f"projected -- regression of issue #2277"
)
assert GEOKEY_PROJECTED_CS_TYPE not in geokeys

@pytest.mark.parametrize("epsg", [4087, 4088, 4499])
def test_projected_inside_legacy_range(self, epsg):
# The other direction of the legacy bug: codes in 4000-4999 that
# are actually projected (World Equidistant Cylindrical at 4087 /
# 4088, CGCS2000 Gauss-Kruger zone 21 at 4499). The fix routes
# these through pyproj, which classifies them correctly.
pytest.importorskip("pyproj")
gt = GeoTransform(500000.0, 4500000.0, 30.0, -30.0)
tags = build_geo_tags(gt, crs_epsg=epsg)
geokeys = tags[TAG_GEO_KEY_DIRECTORY]
assert GEOKEY_PROJECTED_CS_TYPE in geokeys, (
f"EPSG:{epsg} is projected per pyproj but was written as "
f"geographic -- regression of issue #2277"
)
assert GEOKEY_GEOGRAPHIC_TYPE not in geokeys

@pytest.mark.parametrize("epsg", [32610, 3857, 2193])
def test_projected_dispatch(self, epsg):
pytest.importorskip("pyproj")
gt = GeoTransform(500000.0, 4500000.0, 30.0, -30.0)
tags = build_geo_tags(gt, crs_epsg=epsg)
geokeys = tags[TAG_GEO_KEY_DIRECTORY]
assert GEOKEY_PROJECTED_CS_TYPE in geokeys
assert GEOKEY_GEOGRAPHIC_TYPE not in geokeys

def test_unknown_epsg_without_pyproj_raises(self, monkeypatch):
"""Fail closed when pyproj is unavailable and the code is unknown.

Mocks `pyproj` import failure to exercise the fallback path.
EPSG 7844 (GDA2020 geographic) is outside the hard-coded
allowlist, so without pyproj the writer cannot tell geographic
from projected and must raise -- silent corruption is worse
than an explicit error.
"""
import builtins
from xrspatial.geotiff._errors import UnknownCRSModelTypeError

real_import = builtins.__import__

def fake_import(name, *args, **kwargs):
if name == "pyproj" or name.startswith("pyproj."):
raise ImportError("simulated: pyproj not installed")
return real_import(name, *args, **kwargs)

monkeypatch.setattr(builtins, "__import__", fake_import)

gt = GeoTransform(-120.0, 45.0, 0.001, -0.001)
with pytest.raises(UnknownCRSModelTypeError, match="pyproj"):
build_geo_tags(gt, crs_epsg=7844)

def test_known_geographic_without_pyproj_still_works(self, monkeypatch):
"""The hard-coded allowlist keeps working when pyproj is gone.

EPSG 4326 (and the rest of the vetted allowlist) lets common
geographic rasters round-trip without a pyproj dependency.
"""
import builtins

real_import = builtins.__import__

def fake_import(name, *args, **kwargs):
if name == "pyproj" or name.startswith("pyproj."):
raise ImportError("simulated: pyproj not installed")
return real_import(name, *args, **kwargs)

monkeypatch.setattr(builtins, "__import__", fake_import)

gt = GeoTransform(-120.0, 45.0, 0.001, -0.001)
tags = build_geo_tags(gt, crs_epsg=4326)
geokeys = tags[TAG_GEO_KEY_DIRECTORY]
assert GEOKEY_GEOGRAPHIC_TYPE in geokeys
assert GEOKEY_PROJECTED_CS_TYPE not in geokeys

def test_unknown_epsg_with_pyproj_raises(self, monkeypatch):
"""pyproj raises on an unknown EPSG -> writer raises too.

Simulates the case where pyproj is installed but the EPSG code
isn't in its database (e.g. the local PROJ database is out of
date). The fallback set covers a small vetted allowlist;
anything else should raise rather than guess.
"""
from xrspatial.geotiff._errors import UnknownCRSModelTypeError

# Patch CRS.from_epsg to simulate a missing DB entry.
import pyproj

def boom(code):
raise pyproj.exceptions.CRSError(f"simulated unknown code {code}")

monkeypatch.setattr(pyproj.CRS, "from_epsg", staticmethod(boom))

gt = GeoTransform(0.0, 0.0, 30.0, -30.0)
# 9999999 is outside the hard-coded allowlist.
with pytest.raises(UnknownCRSModelTypeError):
build_geo_tags(gt, crs_epsg=9999999)

# And the allowlist still rescues EPSG 4326 even when pyproj fails.
tags = build_geo_tags(gt, crs_epsg=4326)
assert GEOKEY_GEOGRAPHIC_TYPE in tags[TAG_GEO_KEY_DIRECTORY]


def _build_tiff_with_transformation_tag(matrix_16: tuple) -> bytes:
"""Build a tiny single-strip TIFF carrying a 4x4 ModelTransformationTag.

Expand Down
Loading