From b93c5c49a866e6523ca99d9efb9b7eb15fb154f1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 14 May 2026 21:48:06 -0700 Subject: [PATCH 1/2] geotiff: extract writers to _writers/, reduce __init__.py to dispatch (#1888) Final step (10) of #1813's multi-PR refactor of __init__.py. Pure code motion; no public API change. Created the _writers/ subpackage with three modules: - _writers/eager.py: ``to_geotiff`` (public eager writer), ``_write_single_tile`` (per-tile worker), and ``_write_vrt_tiled`` (deprecated vrt_tiled=True path). ~870 lines. - _writers/gpu.py: ``write_geotiff_gpu`` (nvCOMP-backed GPU writer with CPU fallback). ~500 lines. - _writers/vrt.py: ``write_vrt`` (mosaic XML generator with crs_wkt deprecation handling). ~90 lines. __init__.py re-exports the three public writer names. The two private helpers that tests reach for (``_write_single_tile``, ``_apply_nodata_mask_gpu``) keep their package-level re-exports with ``# noqa: F401`` markers since the test files import them directly. Final shape of __init__.py: 529 lines (down from 1905 pre-PR and 4902 at the start of the #1813 series). It now holds only: - Module docstring + ``__all__`` (lines 1-112). - ``_read_geo_info``: metadata-only header parse used by ``open_geotiff`` and lazy-imported by ``_backends/dask.py``. - ``open_geotiff``: the public auto-dispatching read entry point that routes between the four read backends and ``read_vrt``. - ``plot_geotiff``: deprecated wrapper that emits a ``DeprecationWarning`` pointing to ``da.xrs.plot()``. Test updates ============ - test_to_geotiff_gpu_fallback_1674.py monkeypatches ``write_geotiff_gpu`` and ``_is_gpu_data`` as resolved by ``to_geotiff``. The patch targets move from ``xrspatial.geotiff`` to ``xrspatial.geotiff._writers.eager`` so the spy still intercepts the real call sites. Same module-global-rebind pattern as #1895 and #1897. Net change since the start of #1813 ==================================== - __init__.py: 4902 -> 529 lines (-4373; -89%). 
- New per-responsibility modules under xrspatial/geotiff/: _runtime.py (#1880), _crs.py (#1881), _validation.py (#1882), _attrs.py (#1883), _backends/_gpu_helpers.py (#1884), _backends/gpu.py (#1885), _backends/dask.py (#1886), _backends/vrt.py (#1887), _writers/{eager,gpu,vrt}.py (this PR). - One new regression-backstop test: test_backend_pixel_parity_matrix_1813.py (#1879), 192 parametrised cells covering pixel/coord/attrs parity across the four backends and four read entry points. Verification: 2862/2862 of the geotiff test suite pass (the 8 remaining pre-existing main failures from #1517 and #1776 are unrelated to #1813 and were excluded as deselected throughout). Closes #1888. Closes #1813. --- xrspatial/geotiff/__init__.py | 1384 +---------------- xrspatial/geotiff/_writers/__init__.py | 7 + xrspatial/geotiff/_writers/eager.py | 876 +++++++++++ xrspatial/geotiff/_writers/gpu.py | 513 ++++++ xrspatial/geotiff/_writers/vrt.py | 90 ++ .../test_to_geotiff_gpu_fallback_1674.py | 17 +- 6 files changed, 1502 insertions(+), 1385 deletions(-) create mode 100644 xrspatial/geotiff/_writers/__init__.py create mode 100644 xrspatial/geotiff/_writers/eager.py create mode 100644 xrspatial/geotiff/_writers/gpu.py create mode 100644 xrspatial/geotiff/_writers/vrt.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 0320d70f..7f61e514 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -92,6 +92,11 @@ _validate_tile_size_arg, ) from ._writer import write +from ._writers.eager import to_geotiff +# Re-export only; called by xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py. +from ._writers.eager import _write_single_tile # noqa: F401 +from ._writers.gpu import write_geotiff_gpu +from ._writers.vrt import write_vrt # All names below are part of the supported public API. 
``plot_geotiff`` # is intentionally omitted: it is deprecated in favour of ``da.xrs.plot()`` @@ -509,1385 +514,6 @@ def open_geotiff(source: str | BinaryIO, *, return da -def to_geotiff(data: xr.DataArray | np.ndarray, - path: str | BinaryIO, *, - crs: int | str | None = None, - nodata: float | int | None = None, - compression: str = 'zstd', - compression_level: int | None = None, - tiled: bool = True, - tile_size: int = 256, - predictor: bool | int = False, - cog: bool = False, - overview_levels: list[int] | None = None, - overview_resampling: str = 'mean', - bigtiff: bool | None = None, - gpu: bool | None = None, - streaming_buffer_bytes: int = 256 * 1024 * 1024, - max_z_error: float = 0.0, - photometric: str | int = 'auto') -> None: - """Write data as a GeoTIFF or Cloud Optimized GeoTIFF. - - Dask-backed DataArrays are written in streaming mode: one tile-row - at a time, without materialising the full array into RAM. Peak - memory is roughly ``tile_size * width * bytes_per_sample``. COG - output (``cog=True``) still materialises because overviews need the - full array. - - Automatically dispatches to GPU compression when: - - ``gpu=True`` is passed, or - - The input data is CuPy-backed (auto-detected) - - GPU write uses nvCOMP batch compression (deflate/ZSTD) and keeps - the array on device. Falls back to CPU if nvCOMP is not available. - - Parameters - ---------- - data : xr.DataArray or np.ndarray - 2D raster data. - path : str or binary file-like - Output file path, or any object exposing a ``write`` method - (e.g. ``io.BytesIO``). When a file-like is passed, the encoded - TIFF bytes are written to that object once assembly completes. - ``cog=True`` and ``.vrt`` outputs require a string path. - crs : int, str, or None - EPSG code (int), WKT string, or PROJ string. If None and data - is a DataArray, tries to read from attrs ('crs' for EPSG, - 'crs_wkt' for WKT). - - EPSG codes are strongly preferred for interop. 
The WKT-only - path writes ``ProjectedCSType`` / ``GeographicType`` = 32767 - with the WKT stored in ``GTCitationGeoKey`` -- libgeotiff and - GDAL can round-trip this but many other GeoTIFF readers treat - the citation as a free-form name and lose the CRS. A - ``UserWarning`` is emitted when the WKT-only path is taken. - See issue #1768. - nodata : float, int, or None - NoData value. - compression : str - Codec name. One of ``'none'``, ``'deflate'``, ``'lzw'``, - ``'jpeg'``, ``'packbits'``, ``'zstd'``, ``'lz4'``, - ``'jpeg2000'`` (alias ``'j2k'``), or ``'lerc'``. - ``'jpeg'`` is currently rejected on write because the encoder - omits the JPEGTables tag and produced files do not round-trip - through libtiff / GDAL / rasterio. Use ``'deflate'``, ``'zstd'``, - or ``'lzw'`` instead. ``'lerc'`` accepts ``max_z_error`` for - lossy compression with a bounded per-pixel error. - compression_level : int or None - Compression effort level. None uses each codec's default (6 for - deflate/zstd). Valid ranges: deflate 1-9, zstd 1-22, lz4 0-16. - Codecs without a level concept (lzw, packbits, jpeg) accept any - value and ignore it. - tiled : bool - Use tiled layout (default True). - tile_size : int - Tile size in pixels (default 256). Must be a positive multiple - of 16 when ``tiled=True``; this is a TIFF 6 spec requirement - on TileWidth and TileLength for broad reader compatibility. - Ignored when ``tiled=False``; a warning is emitted if a - non-default value is passed alongside strip mode. - predictor : bool or int - TIFF predictor. Accepted values: - - * ``False``, ``0``, or ``1`` -> no predictor. - * ``True`` or ``2`` -> horizontal differencing (good for integer - data; ``True`` and ``2`` are exactly equivalent). - * ``3`` -> floating-point predictor (float dtypes only; typically - gives better deflate/zstd ratios on float data than predictor 2). - cog : bool - Write as Cloud Optimized GeoTIFF. 
- overview_levels : list[int] or None - Overview decimation factors relative to full resolution. - Each entry must be a power-of-two integer >= 2, and the list - must be strictly increasing (e.g. ``[2, 4, 8]`` writes - overviews at 1/2, 1/4 and 1/8 of the full resolution). - Invalid values raise ``ValueError``. Only used when ``cog=True``. - If None and ``cog=True``, levels auto-generate as - ``[2, 4, 8, ...]`` until the next halving would fall below - ``tile_size`` (capped at 8 levels). - overview_resampling : str - Resampling method for overviews: 'mean' (default), 'nearest', - 'min', 'max', 'median', 'mode', or 'cubic'. - bigtiff : bool or None - Force BigTIFF (64-bit offsets). None (default) auto-promotes - when the estimated file size would exceed the classic-TIFF 4 GB - limit. Matches the same kwarg on ``write_geotiff_gpu``. - gpu : bool or None - Force GPU compression. None (default) auto-detects CuPy data. - streaming_buffer_bytes : int - Soft cap on bytes materialised per dask compute call when - streaming a dask-backed DataArray. Defaults to 256 MB. Wide - rasters whose tile-row exceeds this budget are split into - horizontal segments. Ignored for numpy / CuPy / COG paths. - max_z_error : float - Per-pixel error budget for LERC compression. ``0.0`` (default) - is lossless; larger values let the encoder approximate values - within the bound, producing smaller files at the cost of accuracy - bounded by ``abs(decoded - original) <= max_z_error``. Only used - when ``compression='lerc'``; passing a non-zero value with any - other codec raises ``ValueError``. - photometric : str or int - Photometric interpretation for the TIFF Photometric tag (262). - - * ``'auto'`` (default) -- MinIsBlack (1) for any band count. - ExtraSamples for every band beyond the first is tagged ``0`` - (unspecified). Multispectral rasters (e.g. R, G, B, NIR) - round-trip through this default without being silently - labelled as RGB+alpha. 
Prior versions treated any 3+ band - array as RGB and the 4th band as unassociated alpha -- the - behaviour change is intentional (issue #1769). - * ``'rgb'`` -- RGB (Photometric=2). Three colour bands; any - additional bands are tagged ``0`` (unspecified). - * ``'rgba'`` -- RGB with the 4th band tagged as unassociated - alpha (TIFF ExtraSamples=2). Requires at least 4 bands. - * ``'minisblack'`` or ``'miniswhite'`` -- grayscale; multi-band - extras tagged ``0``. - * An ``int`` -- written verbatim into Photometric for advanced - callers (e.g. ``3`` for Palette, ``5`` for CMYK). - - A user-supplied ``extra_tags`` entry of ``(TAG_PHOTOMETRIC, - ...)`` or ``(TAG_EXTRA_SAMPLES, ...)`` overrides the writer's - chosen value; only these two tag ids are overridable so other - auto-emitted tags such as ``ImageWidth`` or ``StripOffsets`` - remain protected. - - Raises - ------ - ValueError - If ``data.attrs['transform']`` is a rotated or skewed affine - (``b != 0`` or ``d != 0`` in rasterio ``Affine`` ordering). The - on-disk GeoTIFF is axis-aligned; reproject onto an axis-aligned - grid first. - ValueError - If ``data`` is a 3D DataArray whose ``dims`` is not - ``(band, y, x)`` or ``(y, x, band)`` (accepting the band-name - aliases ``bands`` / ``channel`` and spatial-name aliases - ``lat`` / ``lon`` / ``latitude`` / ``longitude`` / ``row`` / - ``col``). A leading non-band dim such as ``time`` is rejected - because the writer cannot infer the band axis from arbitrary - names and used to silently treat the leading axis as ``y`` - (issue #1812). - """ - from ._reader import _coerce_path - - path = _coerce_path(path) - - # tiled=False ignores tile_size, so only validate when tiled output - # is requested. Shared with write_geotiff_gpu via - # _validate_tile_size_arg so both writers keep identical validation. 
- if tiled: - _validate_tile_size_arg(tile_size) - - # Up-front validation: catch bad compression names before they reach - # any of the deeper write paths (streaming, GPU, VRT, COG) where the - # error surfaces from _compression_tag with a less obvious traceback. - if isinstance(compression, str): - if compression.lower() not in _VALID_COMPRESSIONS: - raise ValueError( - f"Unknown compression {compression!r}. " - f"Valid options: {list(_VALID_COMPRESSIONS)}.") - # JPEG-in-TIFF (compression=7) requires the JPEGTables tag (347) - # carrying the abbreviated quantization/Huffman tables. The - # current encoder writes a self-contained JFIF stream per - # tile/strip and omits JPEGTables, which makes the resulting - # files unreadable by libtiff / GDAL / rasterio: they reject the - # tile data with "TIFFReadEncodedStrip() failed". The internal - # reader round-trips because Pillow re-decodes the JFIF stream - # directly, masking the interop break. Refuse the write rather - # than emit files no other tool can decode. See issue tracking - # the proper JPEGTables fix for re-enabling this codec. - if compression.lower() == 'jpeg': - raise ValueError( - "compression='jpeg' is not supported: the encoder writes " - "self-contained JFIF streams without the required " - "JPEGTables tag (347), so other readers (libtiff, GDAL, " - "rasterio) reject the file. Use 'deflate', 'zstd', or " - "'lzw' instead.") - - # max_z_error only applies to LERC; reject negative values and reject - # non-zero values paired with any other codec so the caller learns the - # parameter was ignored before bytes hit disk. - if max_z_error < 0: - raise ValueError( - f"max_z_error must be >= 0, got {max_z_error}") - if max_z_error != 0 and ( - not isinstance(compression, str) - or compression.lower() != 'lerc'): - raise ValueError( - "max_z_error is only valid with compression='lerc'") - - # File-like (BytesIO etc.) 
destinations: the streaming, GPU, COG, and - # VRT writers all need a real filesystem path (atomic rename, overview - # passes, sidecar writes). Reject those combos up front so the user - # gets a clear error instead of a deep traceback. - _path_is_file_like = (not isinstance(path, str)) and hasattr(path, 'write') - if _path_is_file_like: - if cog: - raise ValueError( - "cog=True is not supported for file-like destinations. " - "Pass a string path or write to BytesIO without cog=True.") - elif not isinstance(path, str): - raise TypeError( - f"path must be a str or a binary file-like with a write() " - f"method, got {type(path).__name__}") - - _is_vrt_path = ( - isinstance(path, str) and path.lower().endswith('.vrt')) - - # tile_size only applies to tiled output; warn if the caller passed a - # non-default size alongside strip mode (it would otherwise be silently - # ignored). The VRT path always tiles, so the warning would be - # misleading there -- the VRT branch below rejects tiled=False up front - # instead. - if not tiled and tile_size != 256 and not _is_vrt_path: - warnings.warn( - f"tile_size={tile_size} is ignored when tiled=False " - "(strip layout). Pass tiled=True to use tile_size, or drop " - "tile_size to silence this warning.", - stacklevel=2, - ) - - # VRT tiled output (string paths only -- VRT writes a real .vrt file - # plus per-tile GeoTIFFs to a directory) - if _is_vrt_path: - if not tiled: - raise ValueError( - "tiled=False is not compatible with VRT output. " - "VRT writes a directory of tiled GeoTIFFs; pass " - "tiled=True or omit it.") - # The early ``if tiled: _validate_tile_size_arg(tile_size)`` check - # above already validates tile_size when tiled=True, but call it - # here as well so the VRT path stays self-contained against future - # changes to the early-validation gate (no-op on re-entry; either - # raises or returns). - _validate_tile_size_arg(tile_size) - if cog: - raise ValueError( - "cog=True is not compatible with VRT output. 
" - "VRT writes tiled GeoTIFFs, not a single COG.") - if overview_levels is not None: - raise ValueError( - "overview_levels is not compatible with VRT output. " - "VRT tiles do not include overviews.") - _write_vrt_tiled(data, path, - crs=crs, nodata=nodata, - compression=compression, - compression_level=compression_level, - tile_size=tile_size, - predictor=predictor, - bigtiff=bigtiff, - max_z_error=max_z_error, - photometric=photometric) - return - - # Auto-detect GPU data and dispatch to write_geotiff_gpu. ``gpu is - # None`` is the implicit "use whatever fits the data" path; preserve - # that distinction in the fallback warning below so users who never - # set ``gpu=True`` are not told their explicit request was dropped. - auto_detected_gpu = gpu is None - use_gpu = gpu if gpu is not None else _is_gpu_data(data) - if use_gpu and _path_is_file_like: - # write_geotiff_gpu's nvCOMP path materialises tile parts and then - # calls _write_bytes(path), which would write at the buffer's - # current cursor without truncating. More importantly, the GPU - # path was never tested with file-like destinations; refuse rather - # than silently produce something untested. - raise ValueError( - "gpu=True is not supported for file-like destinations. " - "Pass a string path (or set gpu=False).") - if use_gpu: - # GPU writer uses nvCOMP and does not support LERC; refuse rather - # than silently dropping the requested error budget. - if max_z_error != 0: - raise ValueError( - "max_z_error is not supported on the GPU writer " - "(nvCOMP has no LERC backend). Use the CPU path " - "(gpu=False) or omit max_z_error.") - # Strip output is not implemented on the GPU path; reject up - # front rather than silently producing a tiled file. - if not tiled: - raise ValueError( - "tiled=False is not supported on the GPU writer. 
" - "Pass gpu=False or omit tiled=False.") - try: - write_geotiff_gpu(data, path, crs=crs, nodata=nodata, - compression=compression, - compression_level=compression_level, - tiled=tiled, - tile_size=tile_size, - predictor=predictor, - cog=cog, - overview_levels=overview_levels, - overview_resampling=overview_resampling, - bigtiff=bigtiff, - streaming_buffer_bytes=streaming_buffer_bytes, - photometric=photometric) - return - except ImportError as e: - # ``write_geotiff_gpu`` raises ImportError when cupy itself - # can't be imported. nvCOMP absence doesn't surface here: - # ``_try_nvcomp_from_device_bufs`` returns None when the - # library can't load, and the writer drops to CPU - # compression internally instead of re-raising. Fall back - # to the CPU writer with a typed warning so callers see - # that gpu=True (or auto-detected CuPy data) didn't go - # through. Strict mode re-raises so CI can fail loudly on - # missing GPU stacks. - if _geotiff_strict_mode(): - raise - warnings.warn( - _gpu_fallback_warning_message(auto_detected_gpu, e), - GeoTIFFFallbackWarning, - stacklevel=2, - ) - except RuntimeError as e: - # Only fall back when the message names a GPU-availability - # problem; any other RuntimeError is a real bug in the GPU - # writer and the broad ``except (ImportError, Exception)`` - # used to hide it from the user. Keep the keyword list - # tight: nvCOMP / CUDA / no device / no GPU / cuInit cover - # the realistic "no GPU present" failure modes without - # masking, e.g., a CRS or compression error that happens to - # raise RuntimeError. Strict mode re-raises in either case. 
- _gpu_unavail_tokens = ( - 'nvcomp', 'cuda', 'no device', 'no gpu', 'cuinit', - ) - msg = str(e).lower() - if not any(tok in msg for tok in _gpu_unavail_tokens): - raise - if _geotiff_strict_mode(): - raise - warnings.warn( - _gpu_fallback_warning_message(auto_detected_gpu, e), - GeoTIFFFallbackWarning, - stacklevel=2, - ) - - geo_transform = None - epsg = None - wkt_fallback = None # WKT string when EPSG is not available - raster_type = RASTER_PIXEL_IS_AREA - x_res = None - y_res = None - res_unit = None - gdal_meta_xml = None - extra_tags_list = None - - # Resolve crs argument: can be int (EPSG) or str (WKT/PROJ) - if isinstance(crs, int): - epsg = crs - elif isinstance(crs, str): - epsg = _wkt_to_epsg(crs) # try to extract EPSG from WKT/PROJ - if epsg is None: - wkt_fallback = crs - - if isinstance(data, xr.DataArray): - raw = data.data - - # Extract metadata from DataArray attrs (no materialisation needed). - # Prefer attrs['transform'] (from open_geotiff) over the coord-derived - # transform: that path is bit-stable across round-trips, while - # _coords_to_transform can drift on fractional pixel sizes because - # x[1] - x[0] is computed in float64 from already-rounded coords. 
- if geo_transform is None: - geo_transform = _transform_from_attr(data.attrs.get('transform')) - if geo_transform is None: - geo_transform = _coords_to_transform(data) - if epsg is None and crs is None: - crs_attr = data.attrs.get('crs') - if isinstance(crs_attr, str): - epsg = _wkt_to_epsg(crs_attr) - if epsg is None and wkt_fallback is None: - wkt_fallback = crs_attr - elif crs_attr is not None: - epsg = int(crs_attr) - if epsg is None: - wkt = data.attrs.get('crs_wkt') - if isinstance(wkt, str): - epsg = _wkt_to_epsg(wkt) - if epsg is None and wkt_fallback is None: - wkt_fallback = wkt - if nodata is None: - nodata = _resolve_nodata_attr(data.attrs) - # Pull raster_type, gdal_metadata_xml, extra_tags (folded with - # the friendly image_description / extra_samples / colormap - # attrs), x/y_resolution, and resolution_unit via the shared - # helper so all three writers stay in lockstep. - _rich = _extract_rich_tags(data.attrs) - raster_type = _rich['raster_type'] - gdal_meta_xml = _rich['gdal_metadata_xml'] - extra_tags_list = _rich['extra_tags'] - x_res = _rich['x_resolution'] - y_res = _rich['y_resolution'] - res_unit = _rich['resolution_unit'] - - # Dask-backed: stream tiles to avoid materialising the full array. - # COG requires overviews from the full array, so it falls through - # to the eager path. Streaming write needs a real filesystem path - # (it builds a temp file then atomic-renames); for file-like - # destinations we materialise eagerly and assemble in-memory. - if hasattr(raw, 'dask') and not cog and not _path_is_file_like: - dask_arr = raw - # Reject ambiguous 3D layouts at the entry point so a leading - # non-band dim like ``('time', 'y', 'x')`` cannot silently - # round-trip as a TIFF whose ``y`` axis carries the time - # values (issue #1812). 
- if raw.ndim == 3: - _validate_3d_writer_dims(data.dims) - # Handle band-first dimension order (band, y, x) -> (y, x, band) - if raw.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: - import dask.array as da - dask_arr = da.moveaxis(raw, 0, -1) - if dask_arr.ndim not in (2, 3): - raise ValueError( - f"Expected 2D or 3D array, got {dask_arr.ndim}D") - # Validate compression_level - if compression_level is not None: - level_range = _LEVEL_RANGES.get(compression.lower()) - if level_range is not None: - lo, hi = level_range - if not (lo <= compression_level <= hi): - raise ValueError( - f"compression_level={compression_level} out of " - f"range for {compression} (valid: {lo}-{hi})") - from ._writer import write_streaming - write_streaming( - dask_arr, path, - geo_transform=geo_transform, - crs_epsg=epsg, - crs_wkt=wkt_fallback if epsg is None else None, - nodata=nodata, - compression=compression, - compression_level=compression_level, - tiled=tiled, - tile_size=tile_size, - predictor=predictor, - raster_type=raster_type, - x_resolution=x_res, - y_resolution=y_res, - resolution_unit=res_unit, - gdal_metadata_xml=gdal_meta_xml, - extra_tags=extra_tags_list, - bigtiff=bigtiff, - streaming_buffer_bytes=streaming_buffer_bytes, - max_z_error=max_z_error, - photometric=photometric, - ) - return - - # Eager compute (numpy, CuPy, or dask+COG) - if hasattr(raw, 'get'): - arr = raw.get() # CuPy -> numpy - elif hasattr(raw, 'compute'): - arr = raw.compute() # Dask -> numpy - if hasattr(arr, 'get'): - arr = arr.get() # Dask+CuPy -> numpy - else: - arr = np.asarray(raw) - # Reject ambiguous 3D layouts (issue #1812). The validator runs - # on ``data.dims`` (the original DataArray's dim names) rather - # than on ``arr`` so the error fires before the move-axis even - # for COG and file-like destinations that fall through here. 
- if arr.ndim == 3: - _validate_3d_writer_dims(data.dims) - # Handle band-first dimension order (band, y, x) -> (y, x, band) - if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: - arr = np.moveaxis(arr, 0, -1) - else: - if hasattr(data, 'get'): - arr = data.get() # CuPy -> numpy - else: - arr = np.asarray(data) - - if arr.ndim not in (2, 3): - raise ValueError(f"Expected 2D or 3D array, got {arr.ndim}D") - - # Auto-promote unsupported dtypes - if arr.dtype == np.float16: - arr = arr.astype(np.float32) - elif arr.dtype == np.bool_: - arr = arr.astype(np.uint8) - - # Restore NaN pixels to the nodata sentinel value so the written file - # has sentinel values matching the GDAL_NODATA tag. - # - # The defensive ``arr.copy()`` here is intentional: ``arr`` may be - # ``np.asarray(raw)`` of a caller-owned numpy DataArray (a view, - # not a copy) or ``np.moveaxis(arr, 0, -1)`` of one (also a view). - # Mutating without a copy would corrupt the user's input buffer. - if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata): - nan_mask = np.isnan(arr) - if nan_mask.any(): - arr = arr.copy() - arr[nan_mask] = arr.dtype.type(nodata) - - # Validate compression_level against codec-specific range - if compression_level is not None: - level_range = _LEVEL_RANGES.get(compression.lower()) - if level_range is not None: - lo, hi = level_range - if not (lo <= compression_level <= hi): - raise ValueError( - f"compression_level={compression_level} out of range " - f"for {compression} (valid: {lo}-{hi})") - - write( - arr, path, - geo_transform=geo_transform, - crs_epsg=epsg, - crs_wkt=wkt_fallback if epsg is None else None, - nodata=nodata, - compression=compression, - compression_level=compression_level, - tiled=tiled, - tile_size=tile_size, - predictor=predictor, - cog=cog, - overview_levels=overview_levels, - overview_resampling=overview_resampling, - raster_type=raster_type, - x_resolution=x_res, - y_resolution=y_res, - resolution_unit=res_unit, - 
gdal_metadata_xml=gdal_meta_xml, - extra_tags=extra_tags_list, - bigtiff=bigtiff, - max_z_error=max_z_error, - photometric=photometric, - ) - - -def _write_single_tile(chunk_data, path, geo_transform, epsg, wkt, - nodata, compression, compression_level, - tile_size, predictor, bigtiff, - max_z_error: float = 0.0, - raster_type: int = RASTER_PIXEL_IS_AREA, - x_resolution=None, - y_resolution=None, - resolution_unit=None, - gdal_metadata_xml=None, - extra_tags=None, - photometric: str | int = 'auto'): - """Write a single tile GeoTIFF. Used by _write_vrt_tiled. - - Forwards the same rich-tag set that ``to_geotiff`` passes through to - ``write`` (raster_type, x/y resolution, GDAL metadata, extra tags) so - every per-tile file under a VRT carries the same metadata it would - have received from a single-file ``to_geotiff(..., out.tif)`` write. - Without this, ``to_geotiff(da, "out.vrt")`` silently drops everything - except the per-tile geo_transform / crs / nodata. See issue #1606. - """ - if hasattr(chunk_data, 'compute'): - chunk_data = chunk_data.compute() - if hasattr(chunk_data, 'get'): - chunk_data = chunk_data.get() # CuPy -> numpy - - arr = np.asarray(chunk_data) - - # Auto-promote unsupported dtypes - if arr.dtype == np.float16: - arr = arr.astype(np.float32) - elif arr.dtype == np.bool_: - arr = arr.astype(np.uint8) - - # Restore NaN to nodata sentinel. - # - # The defensive ``arr.copy()`` here is intentional: ``arr`` came - # from ``np.asarray(chunk_data)`` where ``chunk_data`` may be a - # caller-owned numpy buffer. Mutating without a copy would corrupt - # the user's input. 
- if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata): - nan_mask = np.isnan(arr) - if nan_mask.any(): - arr = arr.copy() - arr[nan_mask] = arr.dtype.type(nodata) - - write(arr, path, - geo_transform=geo_transform, - crs_epsg=epsg, - crs_wkt=wkt if epsg is None else None, - nodata=nodata, - compression=compression, - tiled=True, - tile_size=tile_size, - predictor=predictor, - compression_level=compression_level, - raster_type=raster_type, - x_resolution=x_resolution, - y_resolution=y_resolution, - resolution_unit=resolution_unit, - gdal_metadata_xml=gdal_metadata_xml, - extra_tags=extra_tags, - bigtiff=bigtiff, - max_z_error=max_z_error, - photometric=photometric) - - -def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, - compression='zstd', compression_level=None, - tile_size=256, predictor: bool | int = False, - bigtiff=None, max_z_error: float = 0.0, - photometric: str | int = 'auto'): - """Write a DataArray as a directory of tiled GeoTIFFs with a VRT index. - - This enables streaming dask arrays to disk without materializing the - full array in RAM. 
- """ - import os - - # Validate compression_level against codec-specific range - if compression_level is not None: - level_range = _LEVEL_RANGES.get(compression.lower()) - if level_range is not None: - lo, hi = level_range - if not (lo <= compression_level <= hi): - raise ValueError( - f"compression_level={compression_level} out of range " - f"for {compression} (valid: {lo}-{hi})") - - # Derive tiles directory from VRT path stem - vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) - stem = os.path.splitext(os.path.basename(vrt_path))[0] - tiles_dir_name = stem + '_tiles' - tiles_dir = os.path.join(vrt_dir, tiles_dir_name) - - # Validate tiles directory - if os.path.isdir(tiles_dir) and os.listdir(tiles_dir): - raise FileExistsError( - f"Tiles directory already contains files: {tiles_dir}") - os.makedirs(tiles_dir, exist_ok=True) - - # Resolve CRS - epsg = None - wkt_fallback = None - if isinstance(crs, int): - epsg = crs - elif isinstance(crs, str): - epsg = _wkt_to_epsg(crs) - if epsg is None: - wkt_fallback = crs - - geo_transform = None - raster_type = RASTER_PIXEL_IS_AREA - x_res = None - y_res = None - res_unit = None - gdal_meta_xml = None - extra_tags_list = None - - if isinstance(data, xr.DataArray): - raw = data.data - if epsg is None and crs is None: - crs_attr = data.attrs.get('crs') - if isinstance(crs_attr, str): - epsg = _wkt_to_epsg(crs_attr) - if epsg is None and wkt_fallback is None: - wkt_fallback = crs_attr - elif crs_attr is not None: - epsg = int(crs_attr) - if epsg is None: - wkt = data.attrs.get('crs_wkt') - if isinstance(wkt, str): - epsg = _wkt_to_epsg(wkt) - if epsg is None and wkt_fallback is None: - wkt_fallback = wkt - if nodata is None: - # Use the same alias-aware resolver that to_geotiff / - # write_geotiff_gpu apply so a rioxarray-style DataArray - # (``attrs['nodatavals']``) or a CF-style one - # (``attrs['_FillValue']``) round-trips through ``.vrt`` - # the same way it does through ``.tif``. 
Before this fix - # the VRT path used ``attrs.get('nodata')`` directly and - # silently dropped both aliases (issue #1606). - nodata = _resolve_nodata_attr(data.attrs) - geo_transform = _transform_from_attr(data.attrs.get('transform')) - if geo_transform is None: - geo_transform = _coords_to_transform(data) - # Pull the same rich-tag set that to_geotiff forwards to - # ``write`` so per-tile files under the VRT carry it too. - _rich = _extract_rich_tags(data.attrs) - raster_type = _rich['raster_type'] - gdal_meta_xml = _rich['gdal_metadata_xml'] - extra_tags_list = _rich['extra_tags'] - x_res = _rich['x_resolution'] - y_res = _rich['y_resolution'] - res_unit = _rich['resolution_unit'] - else: - raw = data - - # Check for dask backing - is_dask = hasattr(raw, 'dask') - - if is_dask: - if raw.ndim != 2: - raise ValueError( - "VRT tiled output currently supports 2D arrays only, " - f"got {raw.ndim}D. Squeeze or select a band first.") - # Use dask chunk grid - import dask - row_chunks = raw.chunks[0] # tuple of chunk sizes along y - col_chunks = raw.chunks[1] # tuple of chunk sizes along x - n_row_tiles = len(row_chunks) - n_col_tiles = len(col_chunks) - else: - # Numpy: tile using tile_size - if hasattr(raw, 'get'): - np_arr = raw.get() # CuPy - elif hasattr(raw, 'compute'): - np_arr = raw.compute() - else: - np_arr = np.asarray(raw) - if np_arr.ndim != 2: - raise ValueError( - "VRT tiled output currently supports 2D arrays only, " - f"got {np_arr.ndim}D. 
Squeeze or select a band first.") - height, width = np_arr.shape[:2] - n_row_tiles = (height + tile_size - 1) // tile_size - n_col_tiles = (width + tile_size - 1) // tile_size - - # Zero-padding width for tile names - pad_width = max(2, len(str(max(n_row_tiles, n_col_tiles) - 1))) - - tile_paths = [] - delayed_tasks = [] - - row_offset = 0 - for ri in range(n_row_tiles): - if is_dask: - chunk_h = row_chunks[ri] - else: - chunk_h = min(tile_size, height - row_offset) - - col_offset = 0 - for ci in range(n_col_tiles): - if is_dask: - chunk_w = col_chunks[ci] - else: - chunk_w = min(tile_size, width - col_offset) - - tile_name = f'tile_{ri:0{pad_width}d}_{ci:0{pad_width}d}.tif' - tile_path = os.path.join(tiles_dir, tile_name) - tile_paths.append(tile_path) - - # Compute per-tile geo_transform - tile_gt = None - if geo_transform is not None: - tile_gt = GeoTransform( - origin_x=geo_transform.origin_x + col_offset * geo_transform.pixel_width, - origin_y=geo_transform.origin_y + row_offset * geo_transform.pixel_height, - pixel_width=geo_transform.pixel_width, - pixel_height=geo_transform.pixel_height, - ) - - if is_dask: - # Slice the dask array for this chunk - r_end = row_offset + chunk_h - c_end = col_offset + chunk_w - chunk_data = raw[row_offset:r_end, col_offset:c_end] - - task = dask.delayed(_write_single_tile)( - chunk_data, tile_path, tile_gt, epsg, wkt_fallback, - nodata, compression, compression_level, - tile_size, predictor, bigtiff, max_z_error, - raster_type=raster_type, - x_resolution=x_res, - y_resolution=y_res, - resolution_unit=res_unit, - gdal_metadata_xml=gdal_meta_xml, - extra_tags=extra_tags_list, - photometric=photometric) - delayed_tasks.append(task) - else: - # Numpy: slice and write directly - chunk_data = np_arr[row_offset:row_offset + chunk_h, - col_offset:col_offset + chunk_w] - _write_single_tile( - chunk_data, tile_path, tile_gt, epsg, wkt_fallback, - nodata, compression, compression_level, - tile_size, predictor, bigtiff, max_z_error, - 
raster_type=raster_type, - x_resolution=x_res, - y_resolution=y_res, - resolution_unit=res_unit, - gdal_metadata_xml=gdal_meta_xml, - extra_tags=extra_tags_list, - photometric=photometric) - - col_offset += chunk_w - row_offset += chunk_h - - # Execute all dask tasks. - # - # Each delayed task is an independent ``_write_single_tile`` call on - # a distinct output path, with no shared mutable Python state, so - # the writes are embarrassingly parallel. Using ``scheduler='threads'`` - # lets zlib / zstd / LZW release the GIL during compression and the - # OS coalesce concurrent writes; in a 256-tile zstd write on a - # 4096x4096 dask DataArray the wall time drops ~33% versus the - # ``synchronous`` scheduler this used to call (issue #1714). - if delayed_tasks: - import dask - dask.compute(*delayed_tasks, scheduler='threads') - - # Write VRT index with relative paths - from ._vrt import write_vrt as _write_vrt_fn - _write_vrt_fn(vrt_path, tile_paths, relative=True, nodata=nodata) - - -def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, - path: str | BinaryIO, *, - crs: int | str | None = None, - nodata: float | int | None = None, - compression: str = 'zstd', - compression_level: int | None = None, - tiled: bool = True, - tile_size: int = 256, - predictor: bool | int = False, - cog: bool = False, - overview_levels: list[int] | None = None, - overview_resampling: str = 'mean', - bigtiff: bool | None = None, - max_z_error: float = 0.0, - streaming_buffer_bytes: int = 256 * 1024 * 1024, - photometric: str | int = 'auto', - allow_internal_only_jpeg: bool = False) -> None: - """Write a CuPy-backed DataArray as a GeoTIFF with GPU compression. - - Tiles are extracted and compressed on the GPU via nvCOMP, then - assembled into a TIFF file on CPU. The CuPy array stays on device - throughout compression -- only the compressed bytes transfer to CPU - for file writing. 
- - When ``cog=True``, generates overview pyramids on GPU and writes a - Cloud Optimized GeoTIFF with all IFDs at the file start for - efficient range-request access. - - Falls back to CPU compression if nvCOMP is not available. - - Parameters - ---------- - data : xr.DataArray (CuPy- or NumPy-backed), cupy.ndarray, or np.ndarray - 2D or 3D raster. CuPy-backed inputs stay on device; NumPy/Dask - inputs are uploaded via ``cupy.asarray(np.asarray(data))`` - before compression (matches ``to_geotiff`` parity). - path : str or binary file-like - Output file path or any object with a ``write`` method - (e.g. ``io.BytesIO``). ``cog=True`` requires a string path: - the auto-dispatch path through ``to_geotiff(gpu=True, cog=True)`` - rejects file-like destinations, and the explicit GPU writer - mirrors that rule (issue #1652). - crs : int, str, or None - EPSG code or WKT string. EPSG codes are strongly preferred for - interop; the WKT-only path emits a user-defined CRS (32767) with - the WKT stored in ``GTCitationGeoKey``, which many non-libgeotiff - readers ignore. A ``UserWarning`` is emitted when the WKT-only - path is taken. See issue #1768. - nodata : float, int, or None - NoData value. - compression : str - Codec name. Accepts the same set ``to_geotiff`` lists in its - own signature: ``'none'``, ``'deflate'``, ``'lzw'``, ``'jpeg'``, - ``'packbits'``, ``'zstd'``, ``'lz4'``, ``'jpeg2000'`` (alias - ``'j2k'``), or ``'lerc'``. - - Routing per codec: - - - ``'zstd'`` (default) and ``'deflate'``: nvCOMP batch - compression on the GPU -- the fastest paths and the reason to - use this entry point. - - ``'jpeg'``: rejected by default for parity with - ``to_geotiff``. Both writers emit self-contained JFIF tiles - and skip the required TIFF JPEGTables tag (347), so the - resulting files are unreadable by libtiff, GDAL, and - rasterio. 
Pass ``allow_internal_only_jpeg=True`` to opt in - to the experimental encode path; the writer then routes to - nvJPEG when libnvjpeg is loadable and falls back to Pillow - otherwise, and emits a ``GeoTIFFFallbackWarning`` so the - caller knows the output will not round-trip through external - readers. See issue #1845. - - ``'jpeg2000'`` and ``'j2k'``: nvJPEG2K GPU encode when - available, glymur CPU encode otherwise. The two paths are - not byte-for-byte identical (different libraries, different - default parameters); use ``to_geotiff`` if you need exact - CPU-writer parity. - - ``'lerc'``, ``'lzw'``, ``'packbits'``, and ``'lz4'``: no - nvCOMP/CUDA accelerator, so these fall through to the CPU - encoder for byte-stable parity with ``to_geotiff``. - compression_level : int or None - Compression effort level. Accepted for API compatibility but - currently ignored -- nvCOMP does not expose level control. - tiled : bool - Must be True (default). The GPU writer is tiled-only because - nvCOMP batch compression operates on per-tile streams; passing - ``tiled=False`` raises ``ValueError`` rather than silently - producing a tiled file. Accepted for API parity with - ``to_geotiff``. - tile_size : int - Tile size in pixels (default 256). Must be a positive multiple - of 16; this is a TIFF 6 spec requirement on TileWidth and - TileLength for broad reader compatibility. ``write_geotiff_gpu`` - is always tiled, so the check fires for every call. - predictor : bool or int - TIFF predictor. ``False``/``0``/``1`` -> none, ``True``/``2`` -> - horizontal differencing, ``3`` -> floating-point predictor - (float dtypes only). - cog : bool - Write as Cloud Optimized GeoTIFF with overviews. - overview_levels : list[int] or None - Overview decimation factors relative to full resolution. - Each entry must be a power-of-two integer >= 2, and the list - must be strictly increasing (e.g. ``[2, 4, 8]`` writes - overviews at 1/2, 1/4 and 1/8 of the full resolution). 
- Invalid values raise ``ValueError``. Only used when ``cog=True``. - If None and ``cog=True``, auto-generates ``[2, 4, 8, ...]`` by - halving until the smallest overview fits in a single tile. - overview_resampling : str - Resampling method for overviews: 'mean' (default), 'nearest', - 'min', 'max', 'median', 'mode', or 'cubic'. ``mode`` and - ``cubic`` fall back to the CPU implementation in - ``xrspatial.geotiff._writer`` so the GPU writer produces the - same overview bytes as the CPU writer. - bigtiff : bool or None - Force BigTIFF (64-bit offsets). None auto-promotes when the - estimated file size would exceed the classic-TIFF 4 GB limit. - max_z_error : float - Per-pixel error budget for LERC compression. The GPU writer - does not implement LERC (nvCOMP has no LERC backend), so any - non-zero value raises ``ValueError``. Accepted at the signature - level for API parity with ``to_geotiff``. - streaming_buffer_bytes : int - Accepted for API parity with ``to_geotiff``. The GPU writer - materialises the entire array on device and has no streaming - concept, so this kwarg is a no-op. Default matches - ``to_geotiff`` (256 MB) so callers passing the same kwargs to - either entry point see the same default and the same type. - photometric : str or int - Photometric interpretation for the TIFF Photometric tag (262). - See :func:`to_geotiff` for the full set of accepted values; the - GPU writer forwards this kwarg unchanged. Default ``'auto'`` - writes MinIsBlack for any band count, so a 4-band raster is - not silently tagged as RGB+alpha (issue #1769). - allow_internal_only_jpeg : bool - Opt in to the experimental ``compression='jpeg'`` encode path - (default ``False``). The encoder emits self-contained JFIF - tiles without the TIFF JPEGTables tag (347); the file decodes - through this library's reader but not through libtiff, GDAL, - or rasterio. With the flag set, the write proceeds and a - ``GeoTIFFFallbackWarning`` is emitted at call time. 
Without - the flag, ``compression='jpeg'`` raises ``ValueError`` for - parity with ``to_geotiff``. See issue #1845. - - Raises - ------ - ValueError - If ``data`` is a 3D DataArray whose ``dims`` is not - ``(band, y, x)`` or ``(y, x, band)`` (accepting band-name - aliases ``bands`` / ``channel`` and spatial-name aliases - ``lat`` / ``lon`` / ``row`` / ``col``). A leading non-band - dim such as ``time`` would otherwise silently round-trip with - the leading axis treated as ``y`` (issue #1812). - """ - if not tiled: - raise ValueError( - "write_geotiff_gpu requires tiled=True. nvCOMP batch " - "compression is tile-based; the strip layout is not " - "implemented on the GPU path. Use to_geotiff(..., gpu=False, " - "tiled=False) for strip output on CPU.") - # JPEG-in-TIFF parity with to_geotiff (issue #1845). The GPU encode - # path writes self-contained JFIF tiles without the TIFF JPEGTables - # tag (347), matching the broken CPU encoder. ``to_geotiff`` refuses - # the codec outright; this writer offered no rejection at all, so - # callers could produce GeoTIFFs that decoded through xrspatial but - # broke in libtiff/GDAL/rasterio. Reject by default with the same - # wording so both entry points agree, and surface an opt-in flag for - # users who explicitly want the internal-only path. - if (isinstance(compression, str) - and compression.lower() == 'jpeg' - and not allow_internal_only_jpeg): - raise ValueError( - "compression='jpeg' is not supported: the encoder writes " - "self-contained JFIF streams without the required " - "JPEGTables tag (347), so other readers (libtiff, GDAL, " - "rasterio) reject the file. Use 'deflate', 'zstd', or " - "'lzw' instead. 
Pass allow_internal_only_jpeg=True to opt " - "in to the experimental internal-reader-only path (issue " - "#1845).") - if (isinstance(compression, str) - and compression.lower() == 'jpeg' - and allow_internal_only_jpeg): - warnings.warn( - "write_geotiff_gpu(compression='jpeg', " - "allow_internal_only_jpeg=True) writes JFIF tiles without " - "the TIFF JPEGTables tag (347); the file decodes through " - "xrspatial but may fail in libtiff, GDAL, or rasterio. " - "See issue #1845.", - GeoTIFFFallbackWarning, - stacklevel=2, - ) - # MinIsWhite pre-inversion (issue #1836) runs in the eager CPU writer. - # The GPU writer assembles tile bytes directly on device; threading - # the pixel + nodata-sentinel transform through that pipeline is out - # of scope for the round-trip fix. Refuse the combination so callers - # do not silently get inverted on-disk values. Move the array to the - # CPU and call the eager ``write`` path for MinIsWhite output. - from ._writer import _resolve_photometric as _resolve_photo_gpu - _gpu_samples_hint = (data.shape[2] if hasattr(data, 'shape') - and data.ndim == 3 else 1) - _gpu_resolved_photo, _ = _resolve_photo_gpu( - photometric, _gpu_samples_hint) - if _gpu_resolved_photo == 0 and _gpu_samples_hint == 1: - raise NotImplementedError( - "photometric='miniswhite' on the GPU writer is not " - "supported: the writer-side pixel inversion that mirrors " - "the reader's unconditional MinIsWhite inversion (issue " - "#1836) is only wired into the eager CPU ``write`` path. " - "Move the array to host memory and call to_geotiff with " - "gpu=False, or write with photometric='minisblack' / " - "'auto'.") - # write_geotiff_gpu is always tiled, so validate tile_size here and - # keep parity with the public to_geotiff entry point. 
- _validate_tile_size_arg(tile_size) - if max_z_error < 0: - raise ValueError( - f"max_z_error must be >= 0, got {max_z_error}") - if max_z_error != 0: - raise ValueError( - "max_z_error is not supported on the GPU writer " - "(nvCOMP has no LERC backend). Use to_geotiff(..., gpu=False) " - "or omit max_z_error.") - # Mirror to_geotiff's path-type + cog=True gating verbatim so callers - # see identical errors from the two entry points. The auto-dispatch - # path through ``to_geotiff(gpu=True, cog=True, path=BytesIO)`` raises - # before reaching here; the explicit GPU writer mirrors the same gate - # so callers cannot bypass it (issue #1652). Non-cog file-like writes - # remain supported on this entry point. - _path_is_file_like = ( - not isinstance(path, str)) and hasattr(path, 'write') - if _path_is_file_like: - if cog: - raise ValueError( - "cog=True is not supported for file-like destinations. " - "Pass a string path or write to BytesIO without cog=True.") - elif not isinstance(path, str): - raise TypeError( - f"path must be a str or a binary file-like with a write() " - f"method, got {type(path).__name__}") - # streaming_buffer_bytes is intentionally a no-op on the GPU path; - # the kwarg exists for API parity with to_geotiff so callers can pass - # the same kwargs to both entry points without filtering. 
- del streaming_buffer_bytes - try: - import cupy - except ImportError: - raise ImportError("cupy is required for GPU writes") - - from ._gpu_decode import gpu_compress_tiles, make_overview_gpu - from ._writer import ( - _compression_tag, _assemble_tiff, _write_bytes, - normalize_predictor, - GeoTransform as _GT, - ) - from ._dtypes import numpy_to_tiff_dtype - - # Extract array and metadata - geo_transform = None - epsg = None - wkt_fallback = None # WKT string when EPSG is not available - raster_type = 1 - gdal_meta_xml = None - extra_tags_list = None - x_res = None - y_res = None - res_unit = None - - if isinstance(crs, int): - epsg = crs - elif isinstance(crs, str): - epsg = _wkt_to_epsg(crs) - if epsg is None: - wkt_fallback = crs - - if isinstance(data, xr.DataArray): - arr = data.data - # Handle Dask arrays: compute to materialize - if hasattr(arr, 'compute'): - arr = arr.compute() - # Now arr should be CuPy or numpy - if hasattr(arr, 'get'): - pass # CuPy array, already on GPU - else: - arr = cupy.asarray(np.asarray(arr)) # numpy -> GPU - - # Reject ambiguous 3D layouts (issue #1812). Mirrors the gate - # in ``to_geotiff``: a leading non-band dim like ``time`` would - # otherwise round-trip with the leading axis silently treated - # as ``y``. - if arr.ndim == 3: - _validate_3d_writer_dims(data.dims) - # Handle band-first dimension order (band, y, x) -> (y, x, band). - # rioxarray and CF-style multi-band rasters land here; without - # this remap the writer treats arr.shape[2] as the band axis and - # produces a transposed file (issue #1580). The CPU writer does - # the same remap at the matching step in to_geotiff(). 
- if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: - arr = cupy.ascontiguousarray(cupy.moveaxis(arr, 0, -1)) - - # Prefer attrs['transform'] over the coord-derived transform: it - # is bit-stable across round-trips, while _coords_to_transform - # can drift on fractional pixel sizes (the same reasoning the - # CPU to_geotiff path applies for issue #1484). - geo_transform = _transform_from_attr(data.attrs.get('transform')) - if geo_transform is None: - geo_transform = _coords_to_transform(data) - # Resolve CRS the same way the CPU writer does. attrs['crs'] may - # be an int EPSG or a WKT string; attrs['crs_wkt'] only carries - # WKT. Without the WKT branch the GPU writer silently drops CRS - # on files whose original CRS only resolves to WKT (no recognized - # EPSG). - if epsg is None and crs is None: - crs_attr = data.attrs.get('crs') - if isinstance(crs_attr, str): - epsg = _wkt_to_epsg(crs_attr) - if epsg is None and wkt_fallback is None: - wkt_fallback = crs_attr - elif crs_attr is not None: - epsg = int(crs_attr) - if epsg is None: - wkt = data.attrs.get('crs_wkt') - if isinstance(wkt, str): - epsg = _wkt_to_epsg(wkt) - if epsg is None and wkt_fallback is None: - wkt_fallback = wkt - if nodata is None: - nodata = _resolve_nodata_attr(data.attrs) - # Mirror the CPU writer's pass-through of GDAL metadata, the - # extra_tags list, the friendly image_description / extra_samples - # / colormap synthesis, and the resolution tags. Without these, - # a GPU write -> CPU read round-trip silently drops every rich - # tag (#1563). 
- _rich = _extract_rich_tags(data.attrs) - raster_type = _rich['raster_type'] - gdal_meta_xml = _rich['gdal_metadata_xml'] - extra_tags_list = _rich['extra_tags'] - x_res = _rich['x_resolution'] - y_res = _rich['y_resolution'] - res_unit = _rich['resolution_unit'] - else: - if hasattr(data, 'compute'): - data = data.compute() # Dask -> CuPy or numpy - if hasattr(data, 'device'): - arr = data # already CuPy - elif hasattr(data, 'get'): - arr = data # CuPy - else: - arr = cupy.asarray(np.asarray(data)) # numpy/list -> GPU - - if arr.ndim not in (2, 3): - raise ValueError(f"Expected 2D or 3D array, got {arr.ndim}D") - - height, width = arr.shape[:2] - samples = arr.shape[2] if arr.ndim == 3 else 1 - np_dtype = np.dtype(str(arr.dtype)) # cupy dtype -> numpy dtype - - # Mirror the CPU writer's NaN-to-sentinel substitution (issue #1599). - # Without this step the GPU writer emits raw NaN bytes interleaved - # with valid data even when ``nodata=`` is supplied; the - # GDAL_NODATA tag still advertises the sentinel but external readers - # (rasterio / GDAL / QGIS) mask only on the sentinel value and - # therefore see the NaN pixels as valid data. The CPU writer does - # the equivalent rewrite at ``to_geotiff`` (lines around - # ``arr.copy(); arr[nan_mask] = arr.dtype.type(nodata)``); both - # paths must produce byte-equivalent files for the same input. - # We always copy before the in-place sentinel write. Some upstream - # branches above already produce a fresh buffer (``cupy.asarray`` - # from numpy/dask, ``ascontiguousarray`` from the band-first - # moveaxis); others (a CuPy-backed DataArray taking the no-moveaxis - # path, or a plain CuPy positional ``data``) hand ``arr`` back as - # the caller's buffer. Rather than tracking provenance across that - # branch tree, copy unconditionally when we are about to mutate -- - # the cost is one GPU array allocation, only on the NaN-present - # path, and it guarantees the CPU writer's defensive-copy semantics - # in every case. 
- if (nodata is not None - and np_dtype.kind == 'f' - and not np.isnan(float(nodata))): - nan_mask = cupy.isnan(arr) - if bool(nan_mask.any()): - arr = arr.copy() - arr[nan_mask] = np_dtype.type(nodata) - - comp_tag = _compression_tag(compression) - pred_val = normalize_predictor(predictor, np_dtype, comp_tag) - - def _gpu_compress_to_part(gpu_arr, w, h, spp): - """Compress a GPU array into a (stub, w, h, offsets, counts, tiles) tuple.""" - compressed = gpu_compress_tiles( - gpu_arr, tile_size, tile_size, w, h, - comp_tag, pred_val, np_dtype, spp) - rel_off = [] - bc = [] - off = 0 - for tile in compressed: - rel_off.append(off) - bc.append(len(tile)) - off += len(tile) - stub = np.empty((1, 1, spp) if spp > 1 else (1, 1), dtype=np_dtype) - return (stub, w, h, rel_off, bc, compressed) - - # Full resolution - parts = [_gpu_compress_to_part(arr, width, height, samples)] - - # Overview generation -- mirrors the CPU writer's 8-level cap. - if cog: - if overview_levels is None: - from ._writer import _MAX_OVERVIEW_LEVELS - # Auto-generated lists hold actual decimation factors (2, - # 4, 8, ...) so the loop below treats auto-generated and - # user-supplied lists identically (issue #1766). - overview_levels = [] - oh, ow = height, width - factor = 2 - while (oh > tile_size and ow > tile_size and - len(overview_levels) < _MAX_OVERVIEW_LEVELS): - oh //= 2 - ow //= 2 - if oh > 0 and ow > 0: - overview_levels.append(factor) - factor *= 2 - else: - # Validate explicit lists: power-of-two factors >= 2, - # strictly increasing, feasible for the input shape. - # Previously the values were ignored and only the list - # length mattered (issue #1766). - from ._writer import _validate_overview_levels - overview_levels = _validate_overview_levels( - overview_levels, height=height, width=width) - - # Pass ``nodata`` so the GPU reducer masks the sentinel back to - # NaN before averaging. 
Without this, the NaN->sentinel rewrite - # done above on ``arr`` leaks the sentinel into the overview - # reduction and poisons the pyramid (issue #1613). Rewrite any - # all-sentinel cell NaN back to the sentinel after each level - # so the on-disk overview tiles still carry the sentinel value - # external readers expect. - current = arr - cumulative_factor = 1 - for target_factor in overview_levels: - # Halve repeatedly until the cumulative decimation matches - # the requested factor. Validation has already established - # that ``target_factor`` is a power of two and strictly - # greater than ``cumulative_factor``. - while cumulative_factor < target_factor: - current = make_overview_gpu(current, method=overview_resampling, - nodata=nodata) - cumulative_factor *= 2 - if (nodata is not None - and np.dtype(str(current.dtype)).kind == 'f' - and not np.isnan(float(nodata))): - nan_mask = cupy.isnan(current) - if bool(nan_mask.any().item()): - current = current.copy() - current[nan_mask] = np.dtype( - str(current.dtype)).type(nodata) - oh, ow = current.shape[:2] - parts.append(_gpu_compress_to_part(current, ow, oh, samples)) - - file_bytes = _assemble_tiff( - width, height, np_dtype, comp_tag, pred_val, True, tile_size, - parts, geo_transform, epsg, nodata, - is_cog=(cog and len(parts) > 1), - raster_type=raster_type, - crs_wkt=wkt_fallback if epsg is None else None, - gdal_metadata_xml=gdal_meta_xml, - extra_tags=extra_tags_list, - x_resolution=x_res, - y_resolution=y_res, - resolution_unit=res_unit, - force_bigtiff=bigtiff, - photometric=photometric, - ) - - _write_bytes(file_bytes, path) - - - -def write_vrt(vrt_path: str, source_files: list[str], *, - relative: bool = True, - crs: int | str | None = None, - crs_wkt: str | None = _CRS_WKT_DEPRECATED_SENTINEL, - nodata: float | int | None = None) -> str: - """Generate a VRT file that mosaics multiple GeoTIFF tiles. - - Parameters - ---------- - vrt_path : str - Output .vrt file path. 
- source_files : list of str - Paths to the source GeoTIFF files. - relative : bool, optional - Store source paths relative to the VRT file (default True). - crs : int, str, or None, optional - EPSG code (int), WKT string, or PROJ string. If None, the CRS - is taken from the first source GeoTIFF. Mirrors the ``crs`` - kwarg on ``to_geotiff`` and ``write_geotiff_gpu`` so the same - value can be forwarded to whichever writer the caller picked - without per-writer special-casing (issue #1715). - crs_wkt : str or None, optional - Deprecated alias for ``crs``. Emits ``DeprecationWarning`` when - supplied (including ``crs_wkt=None``); passing both ``crs`` and - ``crs_wkt`` raises ``TypeError``. The value is forwarded through - the same ``_resolve_crs_to_wkt`` path as ``crs``, so any string - the resolver accepts (WKT root keyword, PROJ string, - ``"EPSG:NNNN"``) and ``None`` work here. The historic - ``str | None`` surface is preserved; new code should use ``crs`` - instead, which additionally accepts ``int`` EPSG codes. - nodata : float, int, or None, optional - NoData value. If None, taken from the first source GeoTIFF. - Integer sentinels (e.g. ``65535`` for uint16, ``-9999`` for - int32) are accepted so the surface lines up with the - ``nodata`` kwarg on ``to_geotiff`` and ``write_geotiff_gpu``. - - Returns - ------- - str - Path to the written VRT file. - """ - # Explicit signature (previously ``**kwargs``) so ``inspect.signature``, - # IDE autocomplete, and ``mypy --strict`` can see the accepted kwargs - # without parsing the docstring. Mirrors ``_vrt.write_vrt`` for the - # historic ``crs_wkt`` path; the new ``crs`` path normalises through - # ``_resolve_crs_to_wkt`` before forwarding because the internal - # writer still only speaks WKT. - crs_wkt_passed = crs_wkt is not _CRS_WKT_DEPRECATED_SENTINEL - if crs is not None and crs_wkt_passed: - # Both supplied is ambiguous regardless of whether the WKT happens - # to encode the same CRS as the int. 
Refuse rather than silently - # picking one. - raise TypeError( - "write_vrt: pass either 'crs' or the deprecated 'crs_wkt' " - "alias, not both.") - if crs_wkt_passed: - warnings.warn( - "write_vrt(..., crs_wkt=...) is deprecated; use crs=... " - "instead. The kwarg was renamed for parity with to_geotiff " - "and write_geotiff_gpu, which already accept 'crs' as either " - "an int EPSG code or a WKT string.", - DeprecationWarning, - stacklevel=2, - ) - crs = crs_wkt - - resolved_wkt = _resolve_crs_to_wkt(crs) - - from ._vrt import write_vrt as _write_vrt_internal - return _write_vrt_internal( - vrt_path, source_files, - relative=relative, - crs_wkt=resolved_wkt, - nodata=nodata, - ) - - def plot_geotiff(da: xr.DataArray, **kwargs): """Plot a DataArray using its embedded colormap if present. diff --git a/xrspatial/geotiff/_writers/__init__.py b/xrspatial/geotiff/_writers/__init__.py new file mode 100644 index 00000000..6f3bcaf2 --- /dev/null +++ b/xrspatial/geotiff/_writers/__init__.py @@ -0,0 +1,7 @@ +"""Writer entry points for the geotiff module. + +Step 10 of issue #1813. Holds ``to_geotiff`` (and its eager-path +helpers), ``write_geotiff_gpu``, and ``write_vrt`` in sibling modules. +The package ``__init__`` stays empty so nothing leaks into +``xrspatial.geotiff`` through implicit re-exports. +""" diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py new file mode 100644 index 00000000..16d3f4fd --- /dev/null +++ b/xrspatial/geotiff/_writers/eager.py @@ -0,0 +1,876 @@ +"""Eager (CPU) writer entry point and helpers. + +Step 10 of issue #1813. Holds ``to_geotiff`` (the public eager writer), +``_write_single_tile`` (per-tile worker used by ``_write_vrt_tiled``), +and ``_write_vrt_tiled`` (the deprecated ``vrt_tiled=True`` path on +``to_geotiff``). 
Companion modules ``_writers/gpu.py`` and +``_writers/vrt.py`` hold the GPU writer and the public ``write_vrt``; +``to_geotiff`` dispatches to them when the caller asks for a GPU +output or a ``.vrt`` path. +""" +from __future__ import annotations + +import math +import os +import warnings +from typing import TYPE_CHECKING + +import numpy as np +import xarray as xr + +if TYPE_CHECKING: + from typing import BinaryIO + +from .._attrs import ( + _LEVEL_RANGES, + _VALID_COMPRESSIONS, + _extract_rich_tags, + _resolve_nodata_attr, +) +from .._backends._gpu_helpers import _is_gpu_data +from .._coords import ( + _BAND_DIM_NAMES, + coords_to_transform as _coords_to_transform, + transform_from_attr as _transform_from_attr, +) +from .._crs import _wkt_to_epsg +from .._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT +from .._runtime import ( + GeoTIFFFallbackWarning, + _geotiff_strict_mode, + _gpu_fallback_warning_message, +) +from .._validation import ( + _validate_3d_writer_dims, + _validate_dtype_cast, + _validate_tile_size_arg, +) +from .._writer import write +from .gpu import write_geotiff_gpu +from .vrt import write_vrt + + +def to_geotiff(data: xr.DataArray | np.ndarray, + path: str | BinaryIO, *, + crs: int | str | None = None, + nodata: float | int | None = None, + compression: str = 'zstd', + compression_level: int | None = None, + tiled: bool = True, + tile_size: int = 256, + predictor: bool | int = False, + cog: bool = False, + overview_levels: list[int] | None = None, + overview_resampling: str = 'mean', + bigtiff: bool | None = None, + gpu: bool | None = None, + streaming_buffer_bytes: int = 256 * 1024 * 1024, + max_z_error: float = 0.0, + photometric: str | int = 'auto') -> None: + """Write data as a GeoTIFF or Cloud Optimized GeoTIFF. + + Dask-backed DataArrays are written in streaming mode: one tile-row + at a time, without materialising the full array into RAM. Peak + memory is roughly ``tile_size * width * bytes_per_sample``. 
COG
+    output (``cog=True``) still materialises because overviews need the
+    full array.
+
+    Automatically dispatches to GPU compression when:
+    - ``gpu=True`` is passed, or
+    - The input data is CuPy-backed (auto-detected)
+
+    GPU write uses nvCOMP batch compression (deflate/ZSTD) and keeps
+    the array on device. Falls back to CPU if nvCOMP is not available.
+
+    Parameters
+    ----------
+    data : xr.DataArray or np.ndarray
+        2D or 3D raster data.
+    path : str or binary file-like
+        Output file path, or any object exposing a ``write`` method
+        (e.g. ``io.BytesIO``). When a file-like is passed, the encoded
+        TIFF bytes are written to that object once assembly completes.
+        ``cog=True`` and ``.vrt`` outputs require a string path.
+    crs : int, str, or None
+        EPSG code (int), WKT string, or PROJ string. If None and data
+        is a DataArray, tries to read from attrs ('crs' for EPSG,
+        'crs_wkt' for WKT).
+
+        EPSG codes are strongly preferred for interop. The WKT-only
+        path writes ``ProjectedCSType`` / ``GeographicType`` = 32767
+        with the WKT stored in ``GTCitationGeoKey`` -- libgeotiff and
+        GDAL can round-trip this but many other GeoTIFF readers treat
+        the citation as a free-form name and lose the CRS. A
+        ``UserWarning`` is emitted when the WKT-only path is taken.
+        See issue #1768.
+    nodata : float, int, or None
+        NoData value.
+    compression : str
+        Codec name. One of ``'none'``, ``'deflate'``, ``'lzw'``,
+        ``'jpeg'``, ``'packbits'``, ``'zstd'``, ``'lz4'``,
+        ``'jpeg2000'`` (alias ``'j2k'``), or ``'lerc'``.
+        ``'jpeg'`` is currently rejected on write because the encoder
+        omits the JPEGTables tag and produced files do not round-trip
+        through libtiff / GDAL / rasterio. Use ``'deflate'``, ``'zstd'``,
+        or ``'lzw'`` instead. ``'lerc'`` accepts ``max_z_error`` for
+        lossy compression with a bounded per-pixel error.
+    compression_level : int or None
+        Compression effort level. None uses each codec's default (6 for
+        deflate/zstd). Valid ranges: deflate 1-9, zstd 1-22, lz4 0-16.
+ Codecs without a level concept (lzw, packbits, jpeg) accept any + value and ignore it. + tiled : bool + Use tiled layout (default True). + tile_size : int + Tile size in pixels (default 256). Must be a positive multiple + of 16 when ``tiled=True``; this is a TIFF 6 spec requirement + on TileWidth and TileLength for broad reader compatibility. + Ignored when ``tiled=False``; a warning is emitted if a + non-default value is passed alongside strip mode. + predictor : bool or int + TIFF predictor. Accepted values: + + * ``False``, ``0``, or ``1`` -> no predictor. + * ``True`` or ``2`` -> horizontal differencing (good for integer + data; ``True`` and ``2`` are exactly equivalent). + * ``3`` -> floating-point predictor (float dtypes only; typically + gives better deflate/zstd ratios on float data than predictor 2). + cog : bool + Write as Cloud Optimized GeoTIFF. + overview_levels : list[int] or None + Overview decimation factors relative to full resolution. + Each entry must be a power-of-two integer >= 2, and the list + must be strictly increasing (e.g. ``[2, 4, 8]`` writes + overviews at 1/2, 1/4 and 1/8 of the full resolution). + Invalid values raise ``ValueError``. Only used when ``cog=True``. + If None and ``cog=True``, levels auto-generate as + ``[2, 4, 8, ...]`` until the next halving would fall below + ``tile_size`` (capped at 8 levels). + overview_resampling : str + Resampling method for overviews: 'mean' (default), 'nearest', + 'min', 'max', 'median', 'mode', or 'cubic'. + bigtiff : bool or None + Force BigTIFF (64-bit offsets). None (default) auto-promotes + when the estimated file size would exceed the classic-TIFF 4 GB + limit. Matches the same kwarg on ``write_geotiff_gpu``. + gpu : bool or None + Force GPU compression. None (default) auto-detects CuPy data. + streaming_buffer_bytes : int + Soft cap on bytes materialised per dask compute call when + streaming a dask-backed DataArray. Defaults to 256 MB. 
Wide + rasters whose tile-row exceeds this budget are split into + horizontal segments. Ignored for numpy / CuPy / COG paths. + max_z_error : float + Per-pixel error budget for LERC compression. ``0.0`` (default) + is lossless; larger values let the encoder approximate values + within the bound, producing smaller files at the cost of accuracy + bounded by ``abs(decoded - original) <= max_z_error``. Only used + when ``compression='lerc'``; passing a non-zero value with any + other codec raises ``ValueError``. + photometric : str or int + Photometric interpretation for the TIFF Photometric tag (262). + + * ``'auto'`` (default) -- MinIsBlack (1) for any band count. + ExtraSamples for every band beyond the first is tagged ``0`` + (unspecified). Multispectral rasters (e.g. R, G, B, NIR) + round-trip through this default without being silently + labelled as RGB+alpha. Prior versions treated any 3+ band + array as RGB and the 4th band as unassociated alpha -- the + behaviour change is intentional (issue #1769). + * ``'rgb'`` -- RGB (Photometric=2). Three colour bands; any + additional bands are tagged ``0`` (unspecified). + * ``'rgba'`` -- RGB with the 4th band tagged as unassociated + alpha (TIFF ExtraSamples=2). Requires at least 4 bands. + * ``'minisblack'`` or ``'miniswhite'`` -- grayscale; multi-band + extras tagged ``0``. + * An ``int`` -- written verbatim into Photometric for advanced + callers (e.g. ``3`` for Palette, ``5`` for CMYK). + + A user-supplied ``extra_tags`` entry of ``(TAG_PHOTOMETRIC, + ...)`` or ``(TAG_EXTRA_SAMPLES, ...)`` overrides the writer's + chosen value; only these two tag ids are overridable so other + auto-emitted tags such as ``ImageWidth`` or ``StripOffsets`` + remain protected. + + Raises + ------ + ValueError + If ``data.attrs['transform']`` is a rotated or skewed affine + (``b != 0`` or ``d != 0`` in rasterio ``Affine`` ordering). The + on-disk GeoTIFF is axis-aligned; reproject onto an axis-aligned + grid first. 
+ ValueError + If ``data`` is a 3D DataArray whose ``dims`` is not + ``(band, y, x)`` or ``(y, x, band)`` (accepting the band-name + aliases ``bands`` / ``channel`` and spatial-name aliases + ``lat`` / ``lon`` / ``latitude`` / ``longitude`` / ``row`` / + ``col``). A leading non-band dim such as ``time`` is rejected + because the writer cannot infer the band axis from arbitrary + names and used to silently treat the leading axis as ``y`` + (issue #1812). + """ + from .._reader import _coerce_path + + path = _coerce_path(path) + + # tiled=False ignores tile_size, so only validate when tiled output + # is requested. Shared with write_geotiff_gpu via + # _validate_tile_size_arg so both writers keep identical validation. + if tiled: + _validate_tile_size_arg(tile_size) + + # Up-front validation: catch bad compression names before they reach + # any of the deeper write paths (streaming, GPU, VRT, COG) where the + # error surfaces from _compression_tag with a less obvious traceback. + if isinstance(compression, str): + if compression.lower() not in _VALID_COMPRESSIONS: + raise ValueError( + f"Unknown compression {compression!r}. " + f"Valid options: {list(_VALID_COMPRESSIONS)}.") + # JPEG-in-TIFF (compression=7) requires the JPEGTables tag (347) + # carrying the abbreviated quantization/Huffman tables. The + # current encoder writes a self-contained JFIF stream per + # tile/strip and omits JPEGTables, which makes the resulting + # files unreadable by libtiff / GDAL / rasterio: they reject the + # tile data with "TIFFReadEncodedStrip() failed". The internal + # reader round-trips because Pillow re-decodes the JFIF stream + # directly, masking the interop break. Refuse the write rather + # than emit files no other tool can decode. See issue tracking + # the proper JPEGTables fix for re-enabling this codec. 
+ if compression.lower() == 'jpeg': + raise ValueError( + "compression='jpeg' is not supported: the encoder writes " + "self-contained JFIF streams without the required " + "JPEGTables tag (347), so other readers (libtiff, GDAL, " + "rasterio) reject the file. Use 'deflate', 'zstd', or " + "'lzw' instead.") + + # max_z_error only applies to LERC; reject negative values and reject + # non-zero values paired with any other codec so the caller learns the + # parameter was ignored before bytes hit disk. + if max_z_error < 0: + raise ValueError( + f"max_z_error must be >= 0, got {max_z_error}") + if max_z_error != 0 and ( + not isinstance(compression, str) + or compression.lower() != 'lerc'): + raise ValueError( + "max_z_error is only valid with compression='lerc'") + + # File-like (BytesIO etc.) destinations: the streaming, GPU, COG, and + # VRT writers all need a real filesystem path (atomic rename, overview + # passes, sidecar writes). Reject those combos up front so the user + # gets a clear error instead of a deep traceback. + _path_is_file_like = (not isinstance(path, str)) and hasattr(path, 'write') + if _path_is_file_like: + if cog: + raise ValueError( + "cog=True is not supported for file-like destinations. " + "Pass a string path or write to BytesIO without cog=True.") + elif not isinstance(path, str): + raise TypeError( + f"path must be a str or a binary file-like with a write() " + f"method, got {type(path).__name__}") + + _is_vrt_path = ( + isinstance(path, str) and path.lower().endswith('.vrt')) + + # tile_size only applies to tiled output; warn if the caller passed a + # non-default size alongside strip mode (it would otherwise be silently + # ignored). The VRT path always tiles, so the warning would be + # misleading there -- the VRT branch below rejects tiled=False up front + # instead. + if not tiled and tile_size != 256 and not _is_vrt_path: + warnings.warn( + f"tile_size={tile_size} is ignored when tiled=False " + "(strip layout). 
Pass tiled=True to use tile_size, or drop " + "tile_size to silence this warning.", + stacklevel=2, + ) + + # VRT tiled output (string paths only -- VRT writes a real .vrt file + # plus per-tile GeoTIFFs to a directory) + if _is_vrt_path: + if not tiled: + raise ValueError( + "tiled=False is not compatible with VRT output. " + "VRT writes a directory of tiled GeoTIFFs; pass " + "tiled=True or omit it.") + # The early ``if tiled: _validate_tile_size_arg(tile_size)`` check + # above already validates tile_size when tiled=True, but call it + # here as well so the VRT path stays self-contained against future + # changes to the early-validation gate (no-op on re-entry; either + # raises or returns). + _validate_tile_size_arg(tile_size) + if cog: + raise ValueError( + "cog=True is not compatible with VRT output. " + "VRT writes tiled GeoTIFFs, not a single COG.") + if overview_levels is not None: + raise ValueError( + "overview_levels is not compatible with VRT output. " + "VRT tiles do not include overviews.") + _write_vrt_tiled(data, path, + crs=crs, nodata=nodata, + compression=compression, + compression_level=compression_level, + tile_size=tile_size, + predictor=predictor, + bigtiff=bigtiff, + max_z_error=max_z_error, + photometric=photometric) + return + + # Auto-detect GPU data and dispatch to write_geotiff_gpu. ``gpu is + # None`` is the implicit "use whatever fits the data" path; preserve + # that distinction in the fallback warning below so users who never + # set ``gpu=True`` are not told their explicit request was dropped. + auto_detected_gpu = gpu is None + use_gpu = gpu if gpu is not None else _is_gpu_data(data) + if use_gpu and _path_is_file_like: + # write_geotiff_gpu's nvCOMP path materialises tile parts and then + # calls _write_bytes(path), which would write at the buffer's + # current cursor without truncating. More importantly, the GPU + # path was never tested with file-like destinations; refuse rather + # than silently produce something untested. 
+ raise ValueError( + "gpu=True is not supported for file-like destinations. " + "Pass a string path (or set gpu=False).") + if use_gpu: + # GPU writer uses nvCOMP and does not support LERC; refuse rather + # than silently dropping the requested error budget. + if max_z_error != 0: + raise ValueError( + "max_z_error is not supported on the GPU writer " + "(nvCOMP has no LERC backend). Use the CPU path " + "(gpu=False) or omit max_z_error.") + # Strip output is not implemented on the GPU path; reject up + # front rather than silently producing a tiled file. + if not tiled: + raise ValueError( + "tiled=False is not supported on the GPU writer. " + "Pass gpu=False or omit tiled=False.") + try: + write_geotiff_gpu(data, path, crs=crs, nodata=nodata, + compression=compression, + compression_level=compression_level, + tiled=tiled, + tile_size=tile_size, + predictor=predictor, + cog=cog, + overview_levels=overview_levels, + overview_resampling=overview_resampling, + bigtiff=bigtiff, + streaming_buffer_bytes=streaming_buffer_bytes, + photometric=photometric) + return + except ImportError as e: + # ``write_geotiff_gpu`` raises ImportError when cupy itself + # can't be imported. nvCOMP absence doesn't surface here: + # ``_try_nvcomp_from_device_bufs`` returns None when the + # library can't load, and the writer drops to CPU + # compression internally instead of re-raising. Fall back + # to the CPU writer with a typed warning so callers see + # that gpu=True (or auto-detected CuPy data) didn't go + # through. Strict mode re-raises so CI can fail loudly on + # missing GPU stacks. + if _geotiff_strict_mode(): + raise + warnings.warn( + _gpu_fallback_warning_message(auto_detected_gpu, e), + GeoTIFFFallbackWarning, + stacklevel=2, + ) + except RuntimeError as e: + # Only fall back when the message names a GPU-availability + # problem; any other RuntimeError is a real bug in the GPU + # writer and the broad ``except (ImportError, Exception)`` + # used to hide it from the user. 
Keep the keyword list + # tight: nvCOMP / CUDA / no device / no GPU / cuInit cover + # the realistic "no GPU present" failure modes without + # masking, e.g., a CRS or compression error that happens to + # raise RuntimeError. Strict mode re-raises in either case. + _gpu_unavail_tokens = ( + 'nvcomp', 'cuda', 'no device', 'no gpu', 'cuinit', + ) + msg = str(e).lower() + if not any(tok in msg for tok in _gpu_unavail_tokens): + raise + if _geotiff_strict_mode(): + raise + warnings.warn( + _gpu_fallback_warning_message(auto_detected_gpu, e), + GeoTIFFFallbackWarning, + stacklevel=2, + ) + + geo_transform = None + epsg = None + wkt_fallback = None # WKT string when EPSG is not available + raster_type = RASTER_PIXEL_IS_AREA + x_res = None + y_res = None + res_unit = None + gdal_meta_xml = None + extra_tags_list = None + + # Resolve crs argument: can be int (EPSG) or str (WKT/PROJ) + if isinstance(crs, int): + epsg = crs + elif isinstance(crs, str): + epsg = _wkt_to_epsg(crs) # try to extract EPSG from WKT/PROJ + if epsg is None: + wkt_fallback = crs + + if isinstance(data, xr.DataArray): + raw = data.data + + # Extract metadata from DataArray attrs (no materialisation needed). + # Prefer attrs['transform'] (from open_geotiff) over the coord-derived + # transform: that path is bit-stable across round-trips, while + # _coords_to_transform can drift on fractional pixel sizes because + # x[1] - x[0] is computed in float64 from already-rounded coords. 
+ if geo_transform is None: + geo_transform = _transform_from_attr(data.attrs.get('transform')) + if geo_transform is None: + geo_transform = _coords_to_transform(data) + if epsg is None and crs is None: + crs_attr = data.attrs.get('crs') + if isinstance(crs_attr, str): + epsg = _wkt_to_epsg(crs_attr) + if epsg is None and wkt_fallback is None: + wkt_fallback = crs_attr + elif crs_attr is not None: + epsg = int(crs_attr) + if epsg is None: + wkt = data.attrs.get('crs_wkt') + if isinstance(wkt, str): + epsg = _wkt_to_epsg(wkt) + if epsg is None and wkt_fallback is None: + wkt_fallback = wkt + if nodata is None: + nodata = _resolve_nodata_attr(data.attrs) + # Pull raster_type, gdal_metadata_xml, extra_tags (folded with + # the friendly image_description / extra_samples / colormap + # attrs), x/y_resolution, and resolution_unit via the shared + # helper so all three writers stay in lockstep. + _rich = _extract_rich_tags(data.attrs) + raster_type = _rich['raster_type'] + gdal_meta_xml = _rich['gdal_metadata_xml'] + extra_tags_list = _rich['extra_tags'] + x_res = _rich['x_resolution'] + y_res = _rich['y_resolution'] + res_unit = _rich['resolution_unit'] + + # Dask-backed: stream tiles to avoid materialising the full array. + # COG requires overviews from the full array, so it falls through + # to the eager path. Streaming write needs a real filesystem path + # (it builds a temp file then atomic-renames); for file-like + # destinations we materialise eagerly and assemble in-memory. + if hasattr(raw, 'dask') and not cog and not _path_is_file_like: + dask_arr = raw + # Reject ambiguous 3D layouts at the entry point so a leading + # non-band dim like ``('time', 'y', 'x')`` cannot silently + # round-trip as a TIFF whose ``y`` axis carries the time + # values (issue #1812). 
+ if raw.ndim == 3: + _validate_3d_writer_dims(data.dims) + # Handle band-first dimension order (band, y, x) -> (y, x, band) + if raw.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: + import dask.array as da + dask_arr = da.moveaxis(raw, 0, -1) + if dask_arr.ndim not in (2, 3): + raise ValueError( + f"Expected 2D or 3D array, got {dask_arr.ndim}D") + # Validate compression_level + if compression_level is not None: + level_range = _LEVEL_RANGES.get(compression.lower()) + if level_range is not None: + lo, hi = level_range + if not (lo <= compression_level <= hi): + raise ValueError( + f"compression_level={compression_level} out of " + f"range for {compression} (valid: {lo}-{hi})") + from .._writer import write_streaming + write_streaming( + dask_arr, path, + geo_transform=geo_transform, + crs_epsg=epsg, + crs_wkt=wkt_fallback if epsg is None else None, + nodata=nodata, + compression=compression, + compression_level=compression_level, + tiled=tiled, + tile_size=tile_size, + predictor=predictor, + raster_type=raster_type, + x_resolution=x_res, + y_resolution=y_res, + resolution_unit=res_unit, + gdal_metadata_xml=gdal_meta_xml, + extra_tags=extra_tags_list, + bigtiff=bigtiff, + streaming_buffer_bytes=streaming_buffer_bytes, + max_z_error=max_z_error, + photometric=photometric, + ) + return + + # Eager compute (numpy, CuPy, or dask+COG) + if hasattr(raw, 'get'): + arr = raw.get() # CuPy -> numpy + elif hasattr(raw, 'compute'): + arr = raw.compute() # Dask -> numpy + if hasattr(arr, 'get'): + arr = arr.get() # Dask+CuPy -> numpy + else: + arr = np.asarray(raw) + # Reject ambiguous 3D layouts (issue #1812). The validator runs + # on ``data.dims`` (the original DataArray's dim names) rather + # than on ``arr`` so the error fires before the move-axis even + # for COG and file-like destinations that fall through here. 
+ if arr.ndim == 3: + _validate_3d_writer_dims(data.dims) + # Handle band-first dimension order (band, y, x) -> (y, x, band) + if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: + arr = np.moveaxis(arr, 0, -1) + else: + if hasattr(data, 'get'): + arr = data.get() # CuPy -> numpy + else: + arr = np.asarray(data) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D array, got {arr.ndim}D") + + # Auto-promote unsupported dtypes + if arr.dtype == np.float16: + arr = arr.astype(np.float32) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) + + # Restore NaN pixels to the nodata sentinel value so the written file + # has sentinel values matching the GDAL_NODATA tag. + # + # The defensive ``arr.copy()`` here is intentional: ``arr`` may be + # ``np.asarray(raw)`` of a caller-owned numpy DataArray (a view, + # not a copy) or ``np.moveaxis(arr, 0, -1)`` of one (also a view). + # Mutating without a copy would corrupt the user's input buffer. + if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata): + nan_mask = np.isnan(arr) + if nan_mask.any(): + arr = arr.copy() + arr[nan_mask] = arr.dtype.type(nodata) + + # Validate compression_level against codec-specific range + if compression_level is not None: + level_range = _LEVEL_RANGES.get(compression.lower()) + if level_range is not None: + lo, hi = level_range + if not (lo <= compression_level <= hi): + raise ValueError( + f"compression_level={compression_level} out of range " + f"for {compression} (valid: {lo}-{hi})") + + write( + arr, path, + geo_transform=geo_transform, + crs_epsg=epsg, + crs_wkt=wkt_fallback if epsg is None else None, + nodata=nodata, + compression=compression, + compression_level=compression_level, + tiled=tiled, + tile_size=tile_size, + predictor=predictor, + cog=cog, + overview_levels=overview_levels, + overview_resampling=overview_resampling, + raster_type=raster_type, + x_resolution=x_res, + y_resolution=y_res, + resolution_unit=res_unit, + 
gdal_metadata_xml=gdal_meta_xml,
+        extra_tags=extra_tags_list,
+        bigtiff=bigtiff,
+        max_z_error=max_z_error,
+        photometric=photometric,
+    )
+
+
+def _write_single_tile(chunk_data, path, geo_transform, epsg, wkt,
+                       nodata, compression, compression_level,
+                       tile_size, predictor, bigtiff,
+                       max_z_error: float = 0.0,
+                       raster_type: int = RASTER_PIXEL_IS_AREA,
+                       x_resolution=None,
+                       y_resolution=None,
+                       resolution_unit=None,
+                       gdal_metadata_xml=None,
+                       extra_tags=None,
+                       photometric: str | int = 'auto'):
+    """Write a single tile GeoTIFF. Used by _write_vrt_tiled.
+
+    Forwards the same rich-tag set that ``to_geotiff`` passes through to
+    ``write`` (raster_type, x/y resolution, GDAL metadata, extra tags) so
+    every per-tile file under a VRT carries the same metadata it would
+    have received from a single-file ``to_geotiff(..., out.tif)`` write.
+    Without this, ``to_geotiff(da, "out.vrt")`` silently drops everything
+    except the per-tile geo_transform / crs / nodata. See issue #1606.
+
+    Parameters
+    ----------
+    chunk_data : array-like
+        One tile's pixels. May be dask-backed (``.compute()`` is called
+        first) or CuPy-backed (``.get()`` moves it to host) before
+        conversion with ``np.asarray``.
+    path : str
+        Destination path for this tile's GeoTIFF.
+    geo_transform : GeoTransform or None
+        Per-tile geotransform computed by the caller; forwarded to
+        ``write`` unchanged (``None`` is passed through as-is).
+    epsg, wkt
+        Resolved CRS. ``wkt`` is only forwarded (as ``crs_wkt``) when
+        ``epsg`` is None, mirroring the caller's EPSG-first precedence.
+    nodata, compression, compression_level, tile_size, predictor,
+    bigtiff, max_z_error, raster_type, x_resolution, y_resolution,
+    resolution_unit, gdal_metadata_xml, extra_tags, photometric
+        Forwarded verbatim to ``write``; see ``to_geotiff`` for their
+        meanings. ``tiled=True`` is hard-coded: per-tile files under a
+        VRT are always written in tiled layout.
+    """
+    # Materialise lazily-evaluated inputs before any dtype handling.
+    if hasattr(chunk_data, 'compute'):
+        chunk_data = chunk_data.compute()
+    if hasattr(chunk_data, 'get'):
+        chunk_data = chunk_data.get()  # CuPy -> numpy
+
+    arr = np.asarray(chunk_data)
+
+    # Auto-promote unsupported dtypes
+    if arr.dtype == np.float16:
+        arr = arr.astype(np.float32)
+    elif arr.dtype == np.bool_:
+        arr = arr.astype(np.uint8)
+
+    # Restore NaN to nodata sentinel.
+    #
+    # The defensive ``arr.copy()`` here is intentional: ``arr`` came
+    # from ``np.asarray(chunk_data)`` where ``chunk_data`` may be a
+    # caller-owned numpy buffer. Mutating without a copy would corrupt
+    # the user's input.
+    # Skipped when nodata itself is NaN: NaN pixels already equal the
+    # sentinel in that case, so no rewrite (or copy) is needed.
+    if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata):
+        nan_mask = np.isnan(arr)
+        if nan_mask.any():
+            arr = arr.copy()
+            arr[nan_mask] = arr.dtype.type(nodata)
+
+    write(arr, path,
+          geo_transform=geo_transform,
+          crs_epsg=epsg,
+          crs_wkt=wkt if epsg is None else None,
+          nodata=nodata,
+          compression=compression,
+          tiled=True,
+          tile_size=tile_size,
+          predictor=predictor,
+          compression_level=compression_level,
+          raster_type=raster_type,
+          x_resolution=x_resolution,
+          y_resolution=y_resolution,
+          resolution_unit=resolution_unit,
+          gdal_metadata_xml=gdal_metadata_xml,
+          extra_tags=extra_tags,
+          bigtiff=bigtiff,
+          max_z_error=max_z_error,
+          photometric=photometric)
+
+
+def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None,
+                     compression='zstd', compression_level=None,
+                     tile_size=256, predictor: bool | int = False,
+                     bigtiff=None, max_z_error: float = 0.0,
+                     photometric: str | int = 'auto'):
+    """Write a DataArray as a directory of tiled GeoTIFFs with a VRT index.
+
+    This enables streaming dask arrays to disk without materializing the
+    full array in RAM.
+
+    Tiles are written to a sibling ``<stem>_tiles/`` directory next to
+    the ``.vrt`` file, and the VRT references them with relative paths.
+ """ + import os + + # Validate compression_level against codec-specific range + if compression_level is not None: + level_range = _LEVEL_RANGES.get(compression.lower()) + if level_range is not None: + lo, hi = level_range + if not (lo <= compression_level <= hi): + raise ValueError( + f"compression_level={compression_level} out of range " + f"for {compression} (valid: {lo}-{hi})") + + # Derive tiles directory from VRT path stem + vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) + stem = os.path.splitext(os.path.basename(vrt_path))[0] + tiles_dir_name = stem + '_tiles' + tiles_dir = os.path.join(vrt_dir, tiles_dir_name) + + # Validate tiles directory + if os.path.isdir(tiles_dir) and os.listdir(tiles_dir): + raise FileExistsError( + f"Tiles directory already contains files: {tiles_dir}") + os.makedirs(tiles_dir, exist_ok=True) + + # Resolve CRS + epsg = None + wkt_fallback = None + if isinstance(crs, int): + epsg = crs + elif isinstance(crs, str): + epsg = _wkt_to_epsg(crs) + if epsg is None: + wkt_fallback = crs + + geo_transform = None + raster_type = RASTER_PIXEL_IS_AREA + x_res = None + y_res = None + res_unit = None + gdal_meta_xml = None + extra_tags_list = None + + if isinstance(data, xr.DataArray): + raw = data.data + if epsg is None and crs is None: + crs_attr = data.attrs.get('crs') + if isinstance(crs_attr, str): + epsg = _wkt_to_epsg(crs_attr) + if epsg is None and wkt_fallback is None: + wkt_fallback = crs_attr + elif crs_attr is not None: + epsg = int(crs_attr) + if epsg is None: + wkt = data.attrs.get('crs_wkt') + if isinstance(wkt, str): + epsg = _wkt_to_epsg(wkt) + if epsg is None and wkt_fallback is None: + wkt_fallback = wkt + if nodata is None: + # Use the same alias-aware resolver that to_geotiff / + # write_geotiff_gpu apply so a rioxarray-style DataArray + # (``attrs['nodatavals']``) or a CF-style one + # (``attrs['_FillValue']``) round-trips through ``.vrt`` + # the same way it does through ``.tif``. 
Before this fix + # the VRT path used ``attrs.get('nodata')`` directly and + # silently dropped both aliases (issue #1606). + nodata = _resolve_nodata_attr(data.attrs) + geo_transform = _transform_from_attr(data.attrs.get('transform')) + if geo_transform is None: + geo_transform = _coords_to_transform(data) + # Pull the same rich-tag set that to_geotiff forwards to + # ``write`` so per-tile files under the VRT carry it too. + _rich = _extract_rich_tags(data.attrs) + raster_type = _rich['raster_type'] + gdal_meta_xml = _rich['gdal_metadata_xml'] + extra_tags_list = _rich['extra_tags'] + x_res = _rich['x_resolution'] + y_res = _rich['y_resolution'] + res_unit = _rich['resolution_unit'] + else: + raw = data + + # Check for dask backing + is_dask = hasattr(raw, 'dask') + + if is_dask: + if raw.ndim != 2: + raise ValueError( + "VRT tiled output currently supports 2D arrays only, " + f"got {raw.ndim}D. Squeeze or select a band first.") + # Use dask chunk grid + import dask + row_chunks = raw.chunks[0] # tuple of chunk sizes along y + col_chunks = raw.chunks[1] # tuple of chunk sizes along x + n_row_tiles = len(row_chunks) + n_col_tiles = len(col_chunks) + else: + # Numpy: tile using tile_size + if hasattr(raw, 'get'): + np_arr = raw.get() # CuPy + elif hasattr(raw, 'compute'): + np_arr = raw.compute() + else: + np_arr = np.asarray(raw) + if np_arr.ndim != 2: + raise ValueError( + "VRT tiled output currently supports 2D arrays only, " + f"got {np_arr.ndim}D. 
Squeeze or select a band first.") + height, width = np_arr.shape[:2] + n_row_tiles = (height + tile_size - 1) // tile_size + n_col_tiles = (width + tile_size - 1) // tile_size + + # Zero-padding width for tile names + pad_width = max(2, len(str(max(n_row_tiles, n_col_tiles) - 1))) + + tile_paths = [] + delayed_tasks = [] + + row_offset = 0 + for ri in range(n_row_tiles): + if is_dask: + chunk_h = row_chunks[ri] + else: + chunk_h = min(tile_size, height - row_offset) + + col_offset = 0 + for ci in range(n_col_tiles): + if is_dask: + chunk_w = col_chunks[ci] + else: + chunk_w = min(tile_size, width - col_offset) + + tile_name = f'tile_{ri:0{pad_width}d}_{ci:0{pad_width}d}.tif' + tile_path = os.path.join(tiles_dir, tile_name) + tile_paths.append(tile_path) + + # Compute per-tile geo_transform + tile_gt = None + if geo_transform is not None: + tile_gt = GeoTransform( + origin_x=geo_transform.origin_x + col_offset * geo_transform.pixel_width, + origin_y=geo_transform.origin_y + row_offset * geo_transform.pixel_height, + pixel_width=geo_transform.pixel_width, + pixel_height=geo_transform.pixel_height, + ) + + if is_dask: + # Slice the dask array for this chunk + r_end = row_offset + chunk_h + c_end = col_offset + chunk_w + chunk_data = raw[row_offset:r_end, col_offset:c_end] + + task = dask.delayed(_write_single_tile)( + chunk_data, tile_path, tile_gt, epsg, wkt_fallback, + nodata, compression, compression_level, + tile_size, predictor, bigtiff, max_z_error, + raster_type=raster_type, + x_resolution=x_res, + y_resolution=y_res, + resolution_unit=res_unit, + gdal_metadata_xml=gdal_meta_xml, + extra_tags=extra_tags_list, + photometric=photometric) + delayed_tasks.append(task) + else: + # Numpy: slice and write directly + chunk_data = np_arr[row_offset:row_offset + chunk_h, + col_offset:col_offset + chunk_w] + _write_single_tile( + chunk_data, tile_path, tile_gt, epsg, wkt_fallback, + nodata, compression, compression_level, + tile_size, predictor, bigtiff, max_z_error, + 
raster_type=raster_type, + x_resolution=x_res, + y_resolution=y_res, + resolution_unit=res_unit, + gdal_metadata_xml=gdal_meta_xml, + extra_tags=extra_tags_list, + photometric=photometric) + + col_offset += chunk_w + row_offset += chunk_h + + # Execute all dask tasks. + # + # Each delayed task is an independent ``_write_single_tile`` call on + # a distinct output path, with no shared mutable Python state, so + # the writes are embarrassingly parallel. Using ``scheduler='threads'`` + # lets zlib / zstd / LZW release the GIL during compression and the + # OS coalesce concurrent writes; in a 256-tile zstd write on a + # 4096x4096 dask DataArray the wall time drops ~33% versus the + # ``synchronous`` scheduler this used to call (issue #1714). + if delayed_tasks: + import dask + dask.compute(*delayed_tasks, scheduler='threads') + + # Write VRT index with relative paths + from .._vrt import write_vrt as _write_vrt_fn + _write_vrt_fn(vrt_path, tile_paths, relative=True, nodata=nodata) + diff --git a/xrspatial/geotiff/_writers/gpu.py b/xrspatial/geotiff/_writers/gpu.py new file mode 100644 index 00000000..43faa662 --- /dev/null +++ b/xrspatial/geotiff/_writers/gpu.py @@ -0,0 +1,513 @@ +"""GPU writer entry point. + +Step 10 of issue #1813. Holds ``write_geotiff_gpu``, which compresses +tiles on the device via nvCOMP (when available) and falls back to the +eager CPU writer when nvCOMP is missing or the device path raises +under ``on_gpu_failure='auto'``. 
+""" +from __future__ import annotations + +import math +import os +import warnings +from typing import TYPE_CHECKING + +import numpy as np +import xarray as xr + +if TYPE_CHECKING: + from typing import BinaryIO + +from .._attrs import _extract_rich_tags, _resolve_nodata_attr +from .._coords import ( + _BAND_DIM_NAMES, + coords_to_transform as _coords_to_transform, + transform_from_attr as _transform_from_attr, +) +from .._crs import _wkt_to_epsg +from .._geotags import GeoTransform +from .._runtime import GeoTIFFFallbackWarning +from .._validation import ( + _validate_3d_writer_dims, + _validate_dtype_cast, + _validate_tile_size_arg, +) +from .._writer import write + + +def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, + path: str | BinaryIO, *, + crs: int | str | None = None, + nodata: float | int | None = None, + compression: str = 'zstd', + compression_level: int | None = None, + tiled: bool = True, + tile_size: int = 256, + predictor: bool | int = False, + cog: bool = False, + overview_levels: list[int] | None = None, + overview_resampling: str = 'mean', + bigtiff: bool | None = None, + max_z_error: float = 0.0, + streaming_buffer_bytes: int = 256 * 1024 * 1024, + photometric: str | int = 'auto', + allow_internal_only_jpeg: bool = False) -> None: + """Write a CuPy-backed DataArray as a GeoTIFF with GPU compression. + + Tiles are extracted and compressed on the GPU via nvCOMP, then + assembled into a TIFF file on CPU. The CuPy array stays on device + throughout compression -- only the compressed bytes transfer to CPU + for file writing. + + When ``cog=True``, generates overview pyramids on GPU and writes a + Cloud Optimized GeoTIFF with all IFDs at the file start for + efficient range-request access. + + Falls back to CPU compression if nvCOMP is not available. + + Parameters + ---------- + data : xr.DataArray (CuPy- or NumPy-backed), cupy.ndarray, or np.ndarray + 2D or 3D raster. 
CuPy-backed inputs stay on device; NumPy/Dask + inputs are uploaded via ``cupy.asarray(np.asarray(data))`` + before compression (matches ``to_geotiff`` parity). + path : str or binary file-like + Output file path or any object with a ``write`` method + (e.g. ``io.BytesIO``). ``cog=True`` requires a string path: + the auto-dispatch path through ``to_geotiff(gpu=True, cog=True)`` + rejects file-like destinations, and the explicit GPU writer + mirrors that rule (issue #1652). + crs : int, str, or None + EPSG code or WKT string. EPSG codes are strongly preferred for + interop; the WKT-only path emits a user-defined CRS (32767) with + the WKT stored in ``GTCitationGeoKey``, which many non-libgeotiff + readers ignore. A ``UserWarning`` is emitted when the WKT-only + path is taken. See issue #1768. + nodata : float, int, or None + NoData value. + compression : str + Codec name. Accepts the same set ``to_geotiff`` lists in its + own signature: ``'none'``, ``'deflate'``, ``'lzw'``, ``'jpeg'``, + ``'packbits'``, ``'zstd'``, ``'lz4'``, ``'jpeg2000'`` (alias + ``'j2k'``), or ``'lerc'``. + + Routing per codec: + + - ``'zstd'`` (default) and ``'deflate'``: nvCOMP batch + compression on the GPU -- the fastest paths and the reason to + use this entry point. + - ``'jpeg'``: rejected by default for parity with + ``to_geotiff``. Both writers emit self-contained JFIF tiles + and skip the required TIFF JPEGTables tag (347), so the + resulting files are unreadable by libtiff, GDAL, and + rasterio. Pass ``allow_internal_only_jpeg=True`` to opt in + to the experimental encode path; the writer then routes to + nvJPEG when libnvjpeg is loadable and falls back to Pillow + otherwise, and emits a ``GeoTIFFFallbackWarning`` so the + caller knows the output will not round-trip through external + readers. See issue #1845. + - ``'jpeg2000'`` and ``'j2k'``: nvJPEG2K GPU encode when + available, glymur CPU encode otherwise. 
The two paths are + not byte-for-byte identical (different libraries, different + default parameters); use ``to_geotiff`` if you need exact + CPU-writer parity. + - ``'lerc'``, ``'lzw'``, ``'packbits'``, and ``'lz4'``: no + nvCOMP/CUDA accelerator, so these fall through to the CPU + encoder for byte-stable parity with ``to_geotiff``. + compression_level : int or None + Compression effort level. Accepted for API compatibility but + currently ignored -- nvCOMP does not expose level control. + tiled : bool + Must be True (default). The GPU writer is tiled-only because + nvCOMP batch compression operates on per-tile streams; passing + ``tiled=False`` raises ``ValueError`` rather than silently + producing a tiled file. Accepted for API parity with + ``to_geotiff``. + tile_size : int + Tile size in pixels (default 256). Must be a positive multiple + of 16; this is a TIFF 6 spec requirement on TileWidth and + TileLength for broad reader compatibility. ``write_geotiff_gpu`` + is always tiled, so the check fires for every call. + predictor : bool or int + TIFF predictor. ``False``/``0``/``1`` -> none, ``True``/``2`` -> + horizontal differencing, ``3`` -> floating-point predictor + (float dtypes only). + cog : bool + Write as Cloud Optimized GeoTIFF with overviews. + overview_levels : list[int] or None + Overview decimation factors relative to full resolution. + Each entry must be a power-of-two integer >= 2, and the list + must be strictly increasing (e.g. ``[2, 4, 8]`` writes + overviews at 1/2, 1/4 and 1/8 of the full resolution). + Invalid values raise ``ValueError``. Only used when ``cog=True``. + If None and ``cog=True``, auto-generates ``[2, 4, 8, ...]`` by + halving until the smallest overview fits in a single tile. + overview_resampling : str + Resampling method for overviews: 'mean' (default), 'nearest', + 'min', 'max', 'median', 'mode', or 'cubic'. 
``mode`` and + ``cubic`` fall back to the CPU implementation in + ``xrspatial.geotiff._writer`` so the GPU writer produces the + same overview bytes as the CPU writer. + bigtiff : bool or None + Force BigTIFF (64-bit offsets). None auto-promotes when the + estimated file size would exceed the classic-TIFF 4 GB limit. + max_z_error : float + Per-pixel error budget for LERC compression. The GPU writer + does not implement LERC (nvCOMP has no LERC backend), so any + non-zero value raises ``ValueError``. Accepted at the signature + level for API parity with ``to_geotiff``. + streaming_buffer_bytes : int + Accepted for API parity with ``to_geotiff``. The GPU writer + materialises the entire array on device and has no streaming + concept, so this kwarg is a no-op. Default matches + ``to_geotiff`` (256 MB) so callers passing the same kwargs to + either entry point see the same default and the same type. + photometric : str or int + Photometric interpretation for the TIFF Photometric tag (262). + See :func:`to_geotiff` for the full set of accepted values; the + GPU writer forwards this kwarg unchanged. Default ``'auto'`` + writes MinIsBlack for any band count, so a 4-band raster is + not silently tagged as RGB+alpha (issue #1769). + allow_internal_only_jpeg : bool + Opt in to the experimental ``compression='jpeg'`` encode path + (default ``False``). The encoder emits self-contained JFIF + tiles without the TIFF JPEGTables tag (347); the file decodes + through this library's reader but not through libtiff, GDAL, + or rasterio. With the flag set, the write proceeds and a + ``GeoTIFFFallbackWarning`` is emitted at call time. Without + the flag, ``compression='jpeg'`` raises ``ValueError`` for + parity with ``to_geotiff``. See issue #1845. 
+ + Raises + ------ + ValueError + If ``data`` is a 3D DataArray whose ``dims`` is not + ``(band, y, x)`` or ``(y, x, band)`` (accepting band-name + aliases ``bands`` / ``channel`` and spatial-name aliases + ``lat`` / ``lon`` / ``row`` / ``col``). A leading non-band + dim such as ``time`` would otherwise silently round-trip with + the leading axis treated as ``y`` (issue #1812). + """ + if not tiled: + raise ValueError( + "write_geotiff_gpu requires tiled=True. nvCOMP batch " + "compression is tile-based; the strip layout is not " + "implemented on the GPU path. Use to_geotiff(..., gpu=False, " + "tiled=False) for strip output on CPU.") + # JPEG-in-TIFF parity with to_geotiff (issue #1845). The GPU encode + # path writes self-contained JFIF tiles without the TIFF JPEGTables + # tag (347), matching the broken CPU encoder. ``to_geotiff`` refuses + # the codec outright; this writer offered no rejection at all, so + # callers could produce GeoTIFFs that decoded through xrspatial but + # broke in libtiff/GDAL/rasterio. Reject by default with the same + # wording so both entry points agree, and surface an opt-in flag for + # users who explicitly want the internal-only path. + if (isinstance(compression, str) + and compression.lower() == 'jpeg' + and not allow_internal_only_jpeg): + raise ValueError( + "compression='jpeg' is not supported: the encoder writes " + "self-contained JFIF streams without the required " + "JPEGTables tag (347), so other readers (libtiff, GDAL, " + "rasterio) reject the file. Use 'deflate', 'zstd', or " + "'lzw' instead. 
Pass allow_internal_only_jpeg=True to opt " + "in to the experimental internal-reader-only path (issue " + "#1845).") + if (isinstance(compression, str) + and compression.lower() == 'jpeg' + and allow_internal_only_jpeg): + warnings.warn( + "write_geotiff_gpu(compression='jpeg', " + "allow_internal_only_jpeg=True) writes JFIF tiles without " + "the TIFF JPEGTables tag (347); the file decodes through " + "xrspatial but may fail in libtiff, GDAL, or rasterio. " + "See issue #1845.", + GeoTIFFFallbackWarning, + stacklevel=2, + ) + # MinIsWhite pre-inversion (issue #1836) runs in the eager CPU writer. + # The GPU writer assembles tile bytes directly on device; threading + # the pixel + nodata-sentinel transform through that pipeline is out + # of scope for the round-trip fix. Refuse the combination so callers + # do not silently get inverted on-disk values. Move the array to the + # CPU and call the eager ``write`` path for MinIsWhite output. + from .._writer import _resolve_photometric as _resolve_photo_gpu + _gpu_samples_hint = (data.shape[2] if hasattr(data, 'shape') + and data.ndim == 3 else 1) + _gpu_resolved_photo, _ = _resolve_photo_gpu( + photometric, _gpu_samples_hint) + if _gpu_resolved_photo == 0 and _gpu_samples_hint == 1: + raise NotImplementedError( + "photometric='miniswhite' on the GPU writer is not " + "supported: the writer-side pixel inversion that mirrors " + "the reader's unconditional MinIsWhite inversion (issue " + "#1836) is only wired into the eager CPU ``write`` path. " + "Move the array to host memory and call to_geotiff with " + "gpu=False, or write with photometric='minisblack' / " + "'auto'.") + # write_geotiff_gpu is always tiled, so validate tile_size here and + # keep parity with the public to_geotiff entry point. 
+ _validate_tile_size_arg(tile_size) + if max_z_error < 0: + raise ValueError( + f"max_z_error must be >= 0, got {max_z_error}") + if max_z_error != 0: + raise ValueError( + "max_z_error is not supported on the GPU writer " + "(nvCOMP has no LERC backend). Use to_geotiff(..., gpu=False) " + "or omit max_z_error.") + # Mirror to_geotiff's path-type + cog=True gating verbatim so callers + # see identical errors from the two entry points. The auto-dispatch + # path through ``to_geotiff(gpu=True, cog=True, path=BytesIO)`` raises + # before reaching here; the explicit GPU writer mirrors the same gate + # so callers cannot bypass it (issue #1652). Non-cog file-like writes + # remain supported on this entry point. + _path_is_file_like = ( + not isinstance(path, str)) and hasattr(path, 'write') + if _path_is_file_like: + if cog: + raise ValueError( + "cog=True is not supported for file-like destinations. " + "Pass a string path or write to BytesIO without cog=True.") + elif not isinstance(path, str): + raise TypeError( + f"path must be a str or a binary file-like with a write() " + f"method, got {type(path).__name__}") + # streaming_buffer_bytes is intentionally a no-op on the GPU path; + # the kwarg exists for API parity with to_geotiff so callers can pass + # the same kwargs to both entry points without filtering. 
+ del streaming_buffer_bytes + try: + import cupy + except ImportError: + raise ImportError("cupy is required for GPU writes") + + from .._gpu_decode import gpu_compress_tiles, make_overview_gpu + from .._writer import ( + _compression_tag, _assemble_tiff, _write_bytes, + normalize_predictor, + GeoTransform as _GT, + ) + from .._dtypes import numpy_to_tiff_dtype + + # Extract array and metadata + geo_transform = None + epsg = None + wkt_fallback = None # WKT string when EPSG is not available + raster_type = 1 + gdal_meta_xml = None + extra_tags_list = None + x_res = None + y_res = None + res_unit = None + + if isinstance(crs, int): + epsg = crs + elif isinstance(crs, str): + epsg = _wkt_to_epsg(crs) + if epsg is None: + wkt_fallback = crs + + if isinstance(data, xr.DataArray): + arr = data.data + # Handle Dask arrays: compute to materialize + if hasattr(arr, 'compute'): + arr = arr.compute() + # Now arr should be CuPy or numpy + if hasattr(arr, 'get'): + pass # CuPy array, already on GPU + else: + arr = cupy.asarray(np.asarray(arr)) # numpy -> GPU + + # Reject ambiguous 3D layouts (issue #1812). Mirrors the gate + # in ``to_geotiff``: a leading non-band dim like ``time`` would + # otherwise round-trip with the leading axis silently treated + # as ``y``. + if arr.ndim == 3: + _validate_3d_writer_dims(data.dims) + # Handle band-first dimension order (band, y, x) -> (y, x, band). + # rioxarray and CF-style multi-band rasters land here; without + # this remap the writer treats arr.shape[2] as the band axis and + # produces a transposed file (issue #1580). The CPU writer does + # the same remap at the matching step in to_geotiff(). 
+ if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: + arr = cupy.ascontiguousarray(cupy.moveaxis(arr, 0, -1)) + + # Prefer attrs['transform'] over the coord-derived transform: it + # is bit-stable across round-trips, while _coords_to_transform + # can drift on fractional pixel sizes (the same reasoning the + # CPU to_geotiff path applies for issue #1484). + geo_transform = _transform_from_attr(data.attrs.get('transform')) + if geo_transform is None: + geo_transform = _coords_to_transform(data) + # Resolve CRS the same way the CPU writer does. attrs['crs'] may + # be an int EPSG or a WKT string; attrs['crs_wkt'] only carries + # WKT. Without the WKT branch the GPU writer silently drops CRS + # on files whose original CRS only resolves to WKT (no recognized + # EPSG). + if epsg is None and crs is None: + crs_attr = data.attrs.get('crs') + if isinstance(crs_attr, str): + epsg = _wkt_to_epsg(crs_attr) + if epsg is None and wkt_fallback is None: + wkt_fallback = crs_attr + elif crs_attr is not None: + epsg = int(crs_attr) + if epsg is None: + wkt = data.attrs.get('crs_wkt') + if isinstance(wkt, str): + epsg = _wkt_to_epsg(wkt) + if epsg is None and wkt_fallback is None: + wkt_fallback = wkt + if nodata is None: + nodata = _resolve_nodata_attr(data.attrs) + # Mirror the CPU writer's pass-through of GDAL metadata, the + # extra_tags list, the friendly image_description / extra_samples + # / colormap synthesis, and the resolution tags. Without these, + # a GPU write -> CPU read round-trip silently drops every rich + # tag (#1563). 
+ _rich = _extract_rich_tags(data.attrs) + raster_type = _rich['raster_type'] + gdal_meta_xml = _rich['gdal_metadata_xml'] + extra_tags_list = _rich['extra_tags'] + x_res = _rich['x_resolution'] + y_res = _rich['y_resolution'] + res_unit = _rich['resolution_unit'] + else: + if hasattr(data, 'compute'): + data = data.compute() # Dask -> CuPy or numpy + if hasattr(data, 'device'): + arr = data # already CuPy + elif hasattr(data, 'get'): + arr = data # CuPy + else: + arr = cupy.asarray(np.asarray(data)) # numpy/list -> GPU + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D array, got {arr.ndim}D") + + height, width = arr.shape[:2] + samples = arr.shape[2] if arr.ndim == 3 else 1 + np_dtype = np.dtype(str(arr.dtype)) # cupy dtype -> numpy dtype + + # Mirror the CPU writer's NaN-to-sentinel substitution (issue #1599). + # Without this step the GPU writer emits raw NaN bytes interleaved + # with valid data even when ``nodata=`` is supplied; the + # GDAL_NODATA tag still advertises the sentinel but external readers + # (rasterio / GDAL / QGIS) mask only on the sentinel value and + # therefore see the NaN pixels as valid data. The CPU writer does + # the equivalent rewrite at ``to_geotiff`` (lines around + # ``arr.copy(); arr[nan_mask] = arr.dtype.type(nodata)``); both + # paths must produce byte-equivalent files for the same input. + # We always copy before the in-place sentinel write. Some upstream + # branches above already produce a fresh buffer (``cupy.asarray`` + # from numpy/dask, ``ascontiguousarray`` from the band-first + # moveaxis); others (a CuPy-backed DataArray taking the no-moveaxis + # path, or a plain CuPy positional ``data``) hand ``arr`` back as + # the caller's buffer. Rather than tracking provenance across that + # branch tree, copy unconditionally when we are about to mutate -- + # the cost is one GPU array allocation, only on the NaN-present + # path, and it guarantees the CPU writer's defensive-copy semantics + # in every case. 
+ if (nodata is not None + and np_dtype.kind == 'f' + and not np.isnan(float(nodata))): + nan_mask = cupy.isnan(arr) + if bool(nan_mask.any()): + arr = arr.copy() + arr[nan_mask] = np_dtype.type(nodata) + + comp_tag = _compression_tag(compression) + pred_val = normalize_predictor(predictor, np_dtype, comp_tag) + + def _gpu_compress_to_part(gpu_arr, w, h, spp): + """Compress a GPU array into a (stub, w, h, offsets, counts, tiles) tuple.""" + compressed = gpu_compress_tiles( + gpu_arr, tile_size, tile_size, w, h, + comp_tag, pred_val, np_dtype, spp) + rel_off = [] + bc = [] + off = 0 + for tile in compressed: + rel_off.append(off) + bc.append(len(tile)) + off += len(tile) + stub = np.empty((1, 1, spp) if spp > 1 else (1, 1), dtype=np_dtype) + return (stub, w, h, rel_off, bc, compressed) + + # Full resolution + parts = [_gpu_compress_to_part(arr, width, height, samples)] + + # Overview generation -- mirrors the CPU writer's 8-level cap. + if cog: + if overview_levels is None: + from .._writer import _MAX_OVERVIEW_LEVELS + # Auto-generated lists hold actual decimation factors (2, + # 4, 8, ...) so the loop below treats auto-generated and + # user-supplied lists identically (issue #1766). + overview_levels = [] + oh, ow = height, width + factor = 2 + while (oh > tile_size and ow > tile_size and + len(overview_levels) < _MAX_OVERVIEW_LEVELS): + oh //= 2 + ow //= 2 + if oh > 0 and ow > 0: + overview_levels.append(factor) + factor *= 2 + else: + # Validate explicit lists: power-of-two factors >= 2, + # strictly increasing, feasible for the input shape. + # Previously the values were ignored and only the list + # length mattered (issue #1766). + from .._writer import _validate_overview_levels + overview_levels = _validate_overview_levels( + overview_levels, height=height, width=width) + + # Pass ``nodata`` so the GPU reducer masks the sentinel back to + # NaN before averaging. 
Without this, the NaN->sentinel rewrite + # done above on ``arr`` leaks the sentinel into the overview + # reduction and poisons the pyramid (issue #1613). Rewrite any + # all-sentinel cell NaN back to the sentinel after each level + # so the on-disk overview tiles still carry the sentinel value + # external readers expect. + current = arr + cumulative_factor = 1 + for target_factor in overview_levels: + # Halve repeatedly until the cumulative decimation matches + # the requested factor. Validation has already established + # that ``target_factor`` is a power of two and strictly + # greater than ``cumulative_factor``. + while cumulative_factor < target_factor: + current = make_overview_gpu(current, method=overview_resampling, + nodata=nodata) + cumulative_factor *= 2 + if (nodata is not None + and np.dtype(str(current.dtype)).kind == 'f' + and not np.isnan(float(nodata))): + nan_mask = cupy.isnan(current) + if bool(nan_mask.any().item()): + current = current.copy() + current[nan_mask] = np.dtype( + str(current.dtype)).type(nodata) + oh, ow = current.shape[:2] + parts.append(_gpu_compress_to_part(current, ow, oh, samples)) + + file_bytes = _assemble_tiff( + width, height, np_dtype, comp_tag, pred_val, True, tile_size, + parts, geo_transform, epsg, nodata, + is_cog=(cog and len(parts) > 1), + raster_type=raster_type, + crs_wkt=wkt_fallback if epsg is None else None, + gdal_metadata_xml=gdal_meta_xml, + extra_tags=extra_tags_list, + x_resolution=x_res, + y_resolution=y_res, + resolution_unit=res_unit, + force_bigtiff=bigtiff, + photometric=photometric, + ) + + _write_bytes(file_bytes, path) + + diff --git a/xrspatial/geotiff/_writers/vrt.py b/xrspatial/geotiff/_writers/vrt.py new file mode 100644 index 00000000..cbaf6480 --- /dev/null +++ b/xrspatial/geotiff/_writers/vrt.py @@ -0,0 +1,90 @@ +"""VRT writer entry point. 
+ +Wraps ``_vrt.write_vrt`` with the public ``write_vrt`` surface: +deprecation handling for ``crs_wkt`` (#1715), normalisation of the +``crs`` kwarg to WKT via ``_resolve_crs_to_wkt``, and the parity +docstring vs ``to_geotiff`` / ``write_geotiff_gpu``. +""" +from __future__ import annotations + +import warnings + +from .._crs import _resolve_crs_to_wkt +from .._runtime import _CRS_WKT_DEPRECATED_SENTINEL + + +def write_vrt(vrt_path: str, source_files: list[str], *, + relative: bool = True, + crs: int | str | None = None, + crs_wkt: str | None = _CRS_WKT_DEPRECATED_SENTINEL, + nodata: float | int | None = None) -> str: + """Generate a VRT file that mosaics multiple GeoTIFF tiles. + + Parameters + ---------- + vrt_path : str + Output .vrt file path. + source_files : list of str + Paths to the source GeoTIFF files. + relative : bool, optional + Store source paths relative to the VRT file (default True). + crs : int, str, or None, optional + EPSG code (int), WKT string, or PROJ string. If None, the CRS + is taken from the first source GeoTIFF. Mirrors the ``crs`` + kwarg on ``to_geotiff`` and ``write_geotiff_gpu`` so the same + value can be forwarded to whichever writer the caller picked + without per-writer special-casing (issue #1715). + crs_wkt : str or None, optional + Deprecated alias for ``crs``. Emits ``DeprecationWarning`` when + supplied (including ``crs_wkt=None``); passing both ``crs`` and + ``crs_wkt`` raises ``TypeError``. The value is forwarded through + the same ``_resolve_crs_to_wkt`` path as ``crs``, so any string + the resolver accepts (WKT root keyword, PROJ string, + ``"EPSG:NNNN"``) and ``None`` work here. The historic + ``str | None`` surface is preserved; new code should use ``crs`` + instead, which additionally accepts ``int`` EPSG codes. + nodata : float, int, or None, optional + NoData value. If None, taken from the first source GeoTIFF. + Integer sentinels (e.g. 
``65535`` for uint16, ``-9999`` for + int32) are accepted so the surface lines up with the + ``nodata`` kwarg on ``to_geotiff`` and ``write_geotiff_gpu``. + + Returns + ------- + str + Path to the written VRT file. + """ + # Explicit signature (previously ``**kwargs``) so ``inspect.signature``, + # IDE autocomplete, and ``mypy --strict`` can see the accepted kwargs + # without parsing the docstring. Mirrors ``_vrt.write_vrt`` for the + # historic ``crs_wkt`` path; the new ``crs`` path normalises through + # ``_resolve_crs_to_wkt`` before forwarding because the internal + # writer still only speaks WKT. + crs_wkt_passed = crs_wkt is not _CRS_WKT_DEPRECATED_SENTINEL + if crs is not None and crs_wkt_passed: + # Both supplied is ambiguous regardless of whether the WKT happens + # to encode the same CRS as the int. Refuse rather than silently + # picking one. + raise TypeError( + "write_vrt: pass either 'crs' or the deprecated 'crs_wkt' " + "alias, not both.") + if crs_wkt_passed: + warnings.warn( + "write_vrt(..., crs_wkt=...) is deprecated; use crs=... " + "instead. The kwarg was renamed for parity with to_geotiff " + "and write_geotiff_gpu, which already accept 'crs' as either " + "an int EPSG code or a WKT string.", + DeprecationWarning, + stacklevel=2, + ) + crs = crs_wkt + + resolved_wkt = _resolve_crs_to_wkt(crs) + + from .._vrt import write_vrt as _write_vrt_internal + return _write_vrt_internal( + vrt_path, source_files, + relative=relative, + crs_wkt=resolved_wkt, + nodata=nodata, + ) diff --git a/xrspatial/geotiff/tests/test_to_geotiff_gpu_fallback_1674.py b/xrspatial/geotiff/tests/test_to_geotiff_gpu_fallback_1674.py index a8a5463d..2451a0db 100644 --- a/xrspatial/geotiff/tests/test_to_geotiff_gpu_fallback_1674.py +++ b/xrspatial/geotiff/tests/test_to_geotiff_gpu_fallback_1674.py @@ -69,9 +69,12 @@ def _patch_gpu_writer_to_raise(monkeypatch, exc): stub that raises ``exc``. 
``to_geotiff`` calls ``write_geotiff_gpu`` directly inside its own - module, so the patch targets the module-level name there. + defining module (``_writers.eager`` since #1888), so the patch + targets the module-level name there. Patching ``xrspatial.geotiff`` + would only update the package re-export and would not intercept the + actual call site. """ - from xrspatial import geotiff as g + from xrspatial.geotiff._writers import eager as g def _boom(*args, **kwargs): raise exc @@ -253,9 +256,11 @@ def test_auto_detected_gpu_fallback_warns( from xrspatial.geotiff import to_geotiff # Synthesise a "CuPy-looking" DataArray via _is_gpu_data's hook. - # Easiest: patch _is_gpu_data to True. The CPU fallback then - # operates on the numpy buffer underneath. - from xrspatial import geotiff as g + # Easiest: patch _is_gpu_data to True in the writer module that + # actually calls it (the to_geotiff body lives in _writers.eager + # since #1888). The CPU fallback then operates on the numpy buffer + # underneath. + from xrspatial.geotiff._writers import eager as g monkeypatch.setattr(g, '_is_gpu_data', lambda data: True, raising=True) _patch_gpu_writer_to_raise(monkeypatch, ImportError("no cupy")) @@ -290,7 +295,7 @@ def test_auto_detected_gpu_runtime_error_falls_back_with_warning( must use the same template so call sites do not diverge over time. """ from xrspatial.geotiff import to_geotiff - from xrspatial import geotiff as g + from xrspatial.geotiff._writers import eager as g monkeypatch.setattr(g, '_is_gpu_data', lambda data: True, raising=True) _patch_gpu_writer_to_raise( From 47a39acaba05f2179a8f8b4f3138282f5dd7b10b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 14 May 2026 21:55:06 -0700 Subject: [PATCH 2/2] geotiff: address Copilot review on _writers extraction (#1899) Drop the unused module-level and in-function imports Copilot flagged: eager.py - Drop ``import math`` (no math.* calls in the file). 
- Drop ``RASTER_PIXEL_IS_POINT`` re-import (only RASTER_PIXEL_IS_AREA is referenced; the POINT path lives in _writers/_vrt_tiled below and uses its own internal constants). - Drop ``_validate_dtype_cast`` re-import (never called; the writer side does not validate dtype casts post-decode). - Drop ``from .vrt import write_vrt`` re-import; ``_write_vrt_tiled`` does a lazy ``from .._vrt import write_vrt as _write_vrt_fn`` at its single call site so the module-top import was dead. - Drop the redundant ``import os`` inside ``_write_vrt_tiled``; the module-level ``import os`` already covers the os.path / os.makedirs calls in that body. gpu.py - Drop ``import math`` and ``import os`` (no math.* / os.* references in the file body). - Drop ``GeoTransform`` re-import at module level (only used in eager.py's _write_single_tile, which still imports it). - Drop ``_validate_dtype_cast`` re-import (never called). - Drop ``from .._writer import write`` re-import; only docstring references mention it. - Drop the in-function ``GeoTransform as _GT`` alias inside the ``_writer`` import block (the alias was carried over verbatim from __init__.py but the function body never references _GT). - Drop the in-function ``from .._dtypes import numpy_to_tiff_dtype`` (also unreferenced in the body). All 2862 geotiff tests still pass; pyflakes is clean on these files except for the ``cupy.ndarray`` type annotation in ``write_geotiff_gpu``'s signature, which works fine under ``from __future__ import annotations`` and matched the same surface the original __init__.py had through earlier #1813 PRs. 
--- xrspatial/geotiff/_writers/eager.py | 7 +------ xrspatial/geotiff/_writers/gpu.py | 7 ------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py index 16d3f4fd..7dfce7c0 100644 --- a/xrspatial/geotiff/_writers/eager.py +++ b/xrspatial/geotiff/_writers/eager.py @@ -10,7 +10,6 @@ """ from __future__ import annotations -import math import os import warnings from typing import TYPE_CHECKING @@ -34,7 +33,7 @@ transform_from_attr as _transform_from_attr, ) from .._crs import _wkt_to_epsg -from .._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT +from .._geotags import GeoTransform, RASTER_PIXEL_IS_AREA from .._runtime import ( GeoTIFFFallbackWarning, _geotiff_strict_mode, @@ -42,12 +41,10 @@ ) from .._validation import ( _validate_3d_writer_dims, - _validate_dtype_cast, _validate_tile_size_arg, ) from .._writer import write from .gpu import write_geotiff_gpu -from .vrt import write_vrt def to_geotiff(data: xr.DataArray | np.ndarray, @@ -674,8 +671,6 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, This enables streaming dask arrays to disk without materializing the full array in RAM. 
""" - import os - # Validate compression_level against codec-specific range if compression_level is not None: level_range = _LEVEL_RANGES.get(compression.lower()) diff --git a/xrspatial/geotiff/_writers/gpu.py b/xrspatial/geotiff/_writers/gpu.py index 43faa662..155253fc 100644 --- a/xrspatial/geotiff/_writers/gpu.py +++ b/xrspatial/geotiff/_writers/gpu.py @@ -7,8 +7,6 @@ """ from __future__ import annotations -import math -import os import warnings from typing import TYPE_CHECKING @@ -25,14 +23,11 @@ transform_from_attr as _transform_from_attr, ) from .._crs import _wkt_to_epsg -from .._geotags import GeoTransform from .._runtime import GeoTIFFFallbackWarning from .._validation import ( _validate_3d_writer_dims, - _validate_dtype_cast, _validate_tile_size_arg, ) -from .._writer import write def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, @@ -286,9 +281,7 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, from .._writer import ( _compression_tag, _assemble_tiff, _write_bytes, normalize_predictor, - GeoTransform as _GT, ) - from .._dtypes import numpy_to_tiff_dtype # Extract array and metadata geo_transform = None