From 70062aebde6042899f532dfa7455506b109e6910 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Thu, 14 May 2026 20:48:33 -0700
Subject: [PATCH 1/2] geotiff: extract read_geotiff_gpu to _backends/gpu.py
 (#1885)

Step 7 of #1813's multi-PR refactor of __init__.py. Pure code motion;
no public API change.

Moved into a new xrspatial/geotiff/_backends/gpu.py:

- read_geotiff_gpu: GPU read entry point (~640 lines, the largest
  single function in __init__.py).
- _gds_chunk_path_available: predicate gating the GDS direct-to-GPU
  chunked path.
- _decode_window_gpu_direct: per-chunk disk->GPU decode used by the
  GDS path.
- _read_geotiff_gpu_chunked: lazy Dask+CuPy graph builder for
  read_geotiff_gpu(chunks=...).
- _read_geotiff_gpu_chunked_gds: direct-to-GPU specialisation built
  by _read_geotiff_gpu_chunked when KvikIO + a chunky tiled local
  file qualifies.

Function bodies are unchanged except for adjusted relative imports
(``._X`` -> ``.._X`` outside _backends, ``._gpu_helpers`` for the
already-extracted GPU helpers). __init__.py re-exports
read_geotiff_gpu so ``from xrspatial.geotiff import read_geotiff_gpu``
keeps working. The four private helpers are not re-exported because
they were only called by read_geotiff_gpu itself; nothing outside
the GPU module references them.

One known consumer of the private names had to update its monkeypatch
target: test_gpu_chunks_out_of_core_1876.py patches
``_decode_window_gpu_direct`` to spy on chunk-decode calls. The patch
target moves from ``xrspatial.geotiff`` to
``xrspatial.geotiff._backends.gpu`` so the spy still intercepts the
real call site (module-global lookup happens in the defining module).

The Dask+CuPy chunked-fallback path needs ``read_geotiff_dask``,
which still lives in __init__.py. To avoid a circular import,
_read_geotiff_gpu_chunked lazy-imports it inside the function body
(``from .. import read_geotiff_dask``). PR #8 will move
``read_geotiff_dask`` into ``_backends/dask.py`` and the lazy import
can be replaced with a static one then.

Net __init__.py change: 3969 -> 2962 lines (-1007). _backends/gpu.py
is 1047 lines.

Verification: pixel-parity matrix from #1889 (192 cells, all GPU
cells), runtime identity (9), GPU byteswap (#1508), GPU kwarg rename
(#1560), GPU chunks out-of-core (#1876, including the GDS spy
tests), full geotiff suite minus 8 pre-existing main failures
(read_to_array monkeypatch + tile_size_positive_works) -- 2862/2862
pass.

Refs #1813.
---
 xrspatial/geotiff/__init__.py                 | 1009 +---------------
 xrspatial/geotiff/_backends/gpu.py            | 1047 +++++++++++++++++
 .../tests/test_gpu_chunks_out_of_core_1876.py |    4 +-
 3 files changed, 1050 insertions(+), 1010 deletions(-)
 create mode 100644 xrspatial/geotiff/_backends/gpu.py

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index f0b4cde8..6c8d364a 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -76,6 +76,7 @@
     _gpu_decode_single_band_tiles,
     _is_gpu_data,
 )
+from ._backends.gpu import read_geotiff_gpu
 from ._crs import _resolve_crs_to_wkt, _wkt_to_epsg
 from ._reader import read_to_array as _read_to_array
 from ._runtime import (
@@ -1787,1014 +1788,6 @@ def _read(http_meta):
     return _read(http_meta_key)
 
 
-def read_geotiff_gpu(source: str, *,
-                     dtype: str | np.dtype | None = None,
-                     overview_level: int | None = None,
-                     window: tuple | None = None,
-                     band: int | None = None,
-                     name: str | None = None,
-                     chunks: int | tuple | None = None,
-                     max_pixels: int | None = None,
-                     on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
-                     gpu: str = _GPU_DEPRECATED_SENTINEL,
-                     ) -> xr.DataArray:
-    """Read a GeoTIFF with GPU-accelerated decompression via Numba CUDA.
-
-    Decompresses all tiles in parallel on the GPU and returns a
-    CuPy-backed DataArray that stays on device memory. No CPU->GPU
-    transfer needed for downstream xrspatial GPU operations.
-
-    With ``chunks=``, returns a Dask+CuPy DataArray with real
-    out-of-core memory bounds: each chunk reads only the tiles for its
-    window (via the CPU dask path) and uploads the result to the
-    device, so peak GPU memory is one chunk rather than the whole
-    raster. The trade-off is per-chunk CPU decode rather than bulk GPU
-    decode; for rasters that fit on the device, ``chunks=None`` keeps
-    the full GPU-decode fast path.
-
-    Requires: cupy, numba with CUDA support.
-
-    Parameters
-    ----------
-    source : str
-        File path.
-    dtype : str, numpy.dtype, or None
-        Cast the result to this dtype after reading. None keeps the
-        file's native dtype. Float-to-int casts raise ValueError, mirroring
-        ``open_geotiff`` / ``read_geotiff_dask``.
-    overview_level : int or None
-        Overview level (0 = full resolution).
-    window : tuple or None
-        ``(row_start, col_start, row_stop, col_stop)`` for windowed
-        reading. None reads the full raster. The GPU pipeline currently
-        decodes all tiles and slices on device after assembly, so the
-        kwarg restores API parity with ``open_geotiff`` and
-        ``read_geotiff_dask`` but does not yet skip I/O for partial
-        windows. The returned coords, ``attrs['transform']``, and
-        shape match the eager numpy path.
-    band : int or None
-        Zero-based band index. None returns all bands (3D output for
-        multi-band files, 2D for single-band). Selecting a single band
-        yields a 2D DataArray.
-    chunks : int, tuple, or None
-        If set, return a Dask-chunked CuPy DataArray decoded one chunk
-        at a time. int for square chunks, (row, col) tuple for
-        rectangular. Each chunk task reads only the tiles overlapping
-        its window (CPU decode) and uploads the result to the device,
-        so peak GPU memory is bounded by chunk size. ``chunks=None``
-        (default) decodes the full raster on the GPU in one pass.
-    name : str or None
-        Name for the DataArray.
-    max_pixels : int or None
-        Maximum allowed pixel count (width * height * samples). None
-        uses the default (~1 billion).
-    on_gpu_failure : {'auto', 'strict'}, default 'auto'
-        Behaviour when any GPU decode stage raises an exception.
-
-        The GPU pipeline has two stages: first ``gpu_decode_tiles_from_file``
-        (GDS-style direct read), then ``gpu_decode_tiles`` over CPU-mmap
-        extracted tile bytes. Both stages still run on the GPU. The CPU
-        fallback (``read_to_array`` + ``cupy.asarray``) only fires after
-        both GPU stages have failed.
-
-        - ``'auto'``: each GPU-stage failure emits a ``RuntimeWarning``
-          reporting the original exception type and message, then falls
-          through to the next stage (CPU mmap re-decode for the first
-          failure, full CPU decode + GPU transfer for the second). This
-          preserves backward-compatible behaviour while making GPU
-          regressions visible.
-        - ``'strict'``: re-raise the original exception from either stage
-          so GPU bugs surface immediately. Useful in tests and CI for the
-          GPU fast path.
-
-        Stripped layouts and sparse-tile files route directly to the CPU
-        reader before either GPU decode stage runs, so the ``on_gpu_failure``
-        kwarg does not affect them. A failure inside the subsequent
-        ``cupy.asarray(...)`` upload propagates unchanged in both modes.
-    gpu : str, optional
-        Deprecated alias for ``on_gpu_failure``. Emits ``DeprecationWarning``
-        when used. Passing both ``gpu`` and ``on_gpu_failure`` raises
-        ``TypeError``. The old name shipped with values ``'auto'`` /
-        ``'strict'`` and was easy to confuse with the boolean ``gpu=``
-        kwarg on ``open_geotiff`` / ``to_geotiff`` / ``read_vrt``.
-
-    Returns
-    -------
-    xr.DataArray
-        CuPy-backed DataArray on GPU device.
-    """
-    new_passed = on_gpu_failure is not _ON_GPU_FAILURE_SENTINEL
-    old_passed = gpu is not _GPU_DEPRECATED_SENTINEL
-    if new_passed and old_passed:
-        # Both supplied is ambiguous regardless of which values were
-        # chosen (including the matching ``on_gpu_failure='auto',
-        # gpu='auto'`` pair). Refuse rather than silently picking one.
-        raise TypeError(
-            "read_geotiff_gpu: pass either 'on_gpu_failure' or the "
-            "deprecated 'gpu' alias, not both.")
-    if old_passed:
-        warnings.warn(
-            "read_geotiff_gpu(..., gpu=...) is deprecated; use "
-            "on_gpu_failure=... instead. The kwarg was renamed because "
-            "'gpu' on open_geotiff/to_geotiff/read_vrt is a bool that "
-            "selects the GPU backend, while here it selects the failure "
-            "policy when the GPU path raises.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        on_gpu_failure = gpu
-    elif not new_passed:
-        on_gpu_failure = 'auto'
-    gpu = on_gpu_failure
-    if gpu not in ('auto', 'strict'):
-        raise ValueError(
-            f"on_gpu_failure must be 'auto' or 'strict', got {gpu!r}")
-    # Reject non-positive chunk sizes up front so the GPU dask+cupy path
-    # surfaces the same error as ``read_geotiff_dask`` (#1776). Previously
-    # ``chunks=0`` raised ``ZeroDivisionError`` deep in cupy/dask, and
-    # ``chunks=-1`` was silently accepted (negative chunks fall out of
-    # the dask chunk grid as a no-op). ``chunks=None`` is the default
-    # (eager read), so allow it through here.
-    chunks = _validate_chunks_arg(chunks, allow_none=True)
-    try:
-        import cupy
-    except ImportError:
-        raise ImportError(
-            "cupy is required for GPU reads. "
-            "Install it with: pip install cupy-cuda12x")
-
-    # When ``chunks=`` is set, bound peak GPU memory to chunk size by
-    # building a Dask+CuPy graph that decodes one chunk at a time. The
-    # CPU dask path already lays out a window-per-chunk delayed graph
-    # (parses TIFF metadata once, decodes only the tiles overlapping
-    # each chunk window, handles HTTP/fsspec/local/sparse/planar=2/
-    # MinIsWhite/nodata/orientation). Reusing it and uploading each
-    # block to the device via ``map_blocks(cupy.asarray)`` gives real
-    # out-of-core behaviour for the read; the trade-off is per-chunk
-    # CPU decode rather than the eager path's bulk GPU decode. Users
-    # who want full GPU-side decode (and have device memory for the
-    # whole image) pass ``chunks=None``. See issue #1876.
-    if chunks is not None:
-        return _read_geotiff_gpu_chunked(
-            source, dtype=dtype, chunks=chunks,
-            overview_level=overview_level, window=window, band=band,
-            name=name, max_pixels=max_pixels,
-        )
-
-    from ._reader import (
-        _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
-        _resolve_masked_fill,
-    )
-    from ._compression import COMPRESSION_LERC
-    from ._header import (
-        parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout,
-    )
-    from ._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
-    from ._geotags import extract_geo_info_with_overview_inheritance
-    from ._gpu_decode import gpu_decode_tiles
-
-    source = _coerce_path(source)
-
-    if max_pixels is None:
-        max_pixels = MAX_PIXELS_DEFAULT
-
-    # Window basic shape check happens here; full bounds-vs-file validation
-    # runs after the IFD parse below so we can compare against the chosen
-    # overview level's actual height/width. ``band`` is similarly validated
-    # against ``ifd.samples_per_pixel`` after the header parse.
-    if window is not None:
-        if len(window) != 4:
-            raise ValueError(
-                f"window must be a 4-tuple (r0, c0, r1, c1), got {window!r}")
-        w_r0, w_c0, w_r1, w_c1 = window
-        if w_r0 >= w_r1 or w_c0 >= w_c1 or w_r0 < 0 or w_c0 < 0:
-            raise ValueError(
-                f"window={window} has non-positive size or negative origin.")
-
-    # Parse metadata on CPU (fast, <1ms)
-    src = _FileSource(source)
-    data = src.read_all()
-
-    try:
-        header = parse_header(data)
-        ifds = parse_all_ifds(data, header)
-
-        if len(ifds) == 0:
-            raise ValueError("No IFDs found in TIFF file")
-
-        # Skip mask IFDs (NewSubfileType bit 2)
-        ifd = select_overview_ifd(ifds, overview_level)
-
-        bps = resolve_bits_per_sample(ifd.bits_per_sample)
-        file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
-        # Inherit georef from the level-0 IFD when the overview itself
-        # has no geokeys (issue #1640); pass-through for level 0.
-        geo_info = extract_geo_info_with_overview_inheritance(
-            ifd, ifds, data, header.byte_order)
-        # Capture the Orientation tag (274) once so the post-decode flip
-        # below picks it up for both the stripped fallback and the tiled
-        # GPU pipelines. CPU read_to_array applies the array remap +
-        # transform update for stripped reads, so for that branch we
-        # only need to copy the post-flip geo_info back here.
-        orientation = ifd.orientation
-
-        # Orientation tag (274): values 2-8 mean the stored pixel order
-        # differs from display order. A windowed read against a non-default
-        # orientation has ambiguous semantics (does the window refer to
-        # file pixels or display pixels?), so the CPU reader
-        # ``_reader.read_to_array`` rejects ``window=`` for orientation != 1.
-        # Mirror that here so the GPU path agrees with the CPU path and
-        # ``read_geotiff_dask``. Use the same error wording so the failure
-        # message is identical across backends.
-        if orientation != 1 and window is not None:
-            raise ValueError(
-                f"Orientation tag (274) is {orientation}; windowed reads "
-                f"(window=...) and dask-chunked reads (chunks=...) are not "
-                f"supported for non-default orientation. Read the full "
-                f"array first, then slice."
-            )
-
-        # Validate band against the selected IFD's sample count.
-        # ``samples_per_pixel`` is at least 1 for any valid TIFF; we treat
-        # ``band=0`` as "first band" for single-band files too so the
-        # behaviour mirrors ``read_geotiff_dask``.
-        ifd_samples = ifd.samples_per_pixel
-        if band is not None:
-            # Reject ``bool`` and ``np.bool_`` up front;
-            # ``isinstance(True, int)`` is True in Python so
-            # ``True < ifd_samples`` evaluates without raising and silently
-            # reads band 1. ``np.bool_`` is not a subclass of ``bool`` so it
-            # needs its own check to match the VRT path's rejection.
-            # See #1786.
-            if isinstance(band, (bool, np.bool_)):
-                raise ValueError(
-                    f"band must be a non-negative int, got {band!r}")
-            if ifd_samples <= 1:
-                if band != 0:
-                    raise IndexError(
-                        f"band={band} requested on a single-band file.")
-            elif not 0 <= band < ifd_samples:
-                raise IndexError(
-                    f"band={band} out of range for {ifd_samples}-band file.")
-
-        # Validate window upper bounds against the selected IFD's extent.
-        if window is not None:
-            w_r0, w_c0, w_r1, w_c1 = window
-            ifd_h, ifd_w = ifd.height, ifd.width
-            if w_r1 > ifd_h or w_c1 > ifd_w:
-                raise ValueError(
-                    f"window={window} is outside the source extent "
-                    f"({ifd_h}x{ifd_w}).")
-
-        if not ifd.is_tiled:
-            # Fall back to CPU for stripped files. read_to_array remaps
-            # the array but only updates geo_info.transform for orientations
-            # 5-8 today (the 2/3/4 fix in #1539 is in a sibling PR). Discard
-            # its geo_info and apply our own transform update below so the
-            # result is correct regardless of merge order.
-            #
-            # Forward ``max_pixels``, ``window``, and ``band`` so the
-            # caller's safety cap is honoured, windowed reads avoid
-            # decoding the full image, and single-band selection on a
-            # multi-band source skips the unused channels. Without this,
-            # the stripped GPU path bypassed all three (issue #1732).
-            # Orientation != 1 + window is already rejected at line 2495,
-            # so ``window`` is None whenever ``geo_info`` will be remapped
-            # below.
-            src.close()
-            arr_cpu, _stripped_geo = _read_to_array(
-                source, overview_level=overview_level,
-                window=window, band=band, max_pixels=max_pixels)
-            arr_gpu = cupy.asarray(arr_cpu)
-            if orientation != 1:
-                geo_info = _apply_orientation_geo_info(
-                    geo_info, orientation,
-                    file_h=ifd.height, file_w=ifd.width)
-            if name is None:
-                import os
-                name = os.path.splitext(os.path.basename(source))[0]
-            attrs = {}
-            _populate_attrs_from_geo_info(attrs, geo_info, window=window)
-            # Apply nodata mask + record sentinel so the GPU read agrees
-            # with the CPU eager path. Without this, integer rasters keep
-            # the literal sentinel value and float rasters keep the
-            # sentinel rather than NaN -- a silent backend divergence.
-            # ``read_to_array`` stashes the post-MinIsWhite sentinel on
-            # ``_mask_nodata`` when applicable; fall back to the original
-            # sentinel otherwise (#1809).
-            nodata = geo_info.nodata
-            if nodata is not None:
-                attrs['nodata'] = nodata
-                mask_value = getattr(_stripped_geo, '_mask_nodata', nodata)
-                arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value)
-            if dtype is not None:
-                target = np.dtype(dtype)
-                _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
-                arr_gpu = arr_gpu.astype(target)
-            # ``read_to_array`` already applied window + band slicing, so
-            # ``arr_gpu`` is at output shape. Compute coords for that
-            # shape without re-slicing. Mirror the eager-numpy /
-            # ``read_geotiff_dask`` / ``_gpu_apply_window_band`` checks
-            # against ``has_georef``: a non-georef TIFF carries a
-            # default ``GeoTransform()`` placeholder (``t is None`` is
-            # never true here) so a transform-based coord path would
-            # emit synthetic ``[-0.5, -1.5, ...]`` floats instead of
-            # the integer pixel coords every other backend produces
-            # (#1753 / regression of #1710).
-            coords = _coords_from_geo_info(
-                geo_info, arr_gpu.shape[0], arr_gpu.shape[1], window=window,
-            )
-            # Multi-band stripped reads come back as (y, x, band); mirror
-            # the tiled branch so dims line up with ndim. Single-band stays
-            # 2-D ('y', 'x').
-            if arr_gpu.ndim == 3:
-                dims = ['y', 'x', 'band']
-                coords['band'] = np.arange(arr_gpu.shape[2])
-            else:
-                dims = ['y', 'x']
-            result = xr.DataArray(arr_gpu, dims=dims,
-                                  coords=coords, name=name, attrs=attrs)
-            # ``chunks`` was previously honoured only on the tiled path,
-            # so stripped TIFFs returned an unchunked DataArray even when
-            # the caller asked for a Dask+CuPy result. Mirror the tiled
-            # branch's chunking step so behaviour is consistent across
-            # layouts.
-            if chunks is not None:
-                if isinstance(chunks, int):
-                    chunk_dict = {'y': chunks, 'x': chunks}
-                else:
-                    chunk_dict = {'y': chunks[0], 'x': chunks[1]}
-                result = result.chunk(chunk_dict)
-            return result
-
-        offsets = ifd.tile_offsets
-        byte_counts = ifd.tile_byte_counts
-        compression = ifd.compression
-        predictor = ifd.predictor
-        samples = ifd.samples_per_pixel
-        planar = ifd.planar_config
-        tw = ifd.tile_width
-        th = ifd.tile_height
-        width = ifd.width
-        height = ifd.height
-
-        if tw <= 0 or th <= 0:
-            raise ValueError(
-                f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")
-
-        _check_dimensions(width, height, samples, max_pixels)
-        # A single tile's decoded bytes must also fit under the pixel budget.
-        _check_dimensions(tw, th, samples, max_pixels)
-
-        # Reject malformed TIFFs whose declared tile grid exceeds the
-        # supplied TileOffsets length. The GPU tile-assembly kernel would
-        # read OOB otherwise. See issue #1219.
-        validate_tile_layout(ifd)
-
-    finally:
-        src.close()
-
-    # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path.
-    # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline;
-    # the CPU reader fills them with nodata and copies onto the GPU.
-    has_sparse_tile = any(bc == 0 for bc in byte_counts)
-    # LERC tiles can carry a per-pixel valid mask that GDAL writes
-    # zero-filled in the data array.  Compute the nodata fill the same
-    # way the CPU reader does so the GPU decode path can restore it
-    # post-assembly (mirrors PR #1529 for the CPU path). Only the
-    # chunky (planar=1) GPU path threads masked_fill into its kernel
-    # call below; the planar=2 per-band branch falls back to the CPU
-    # reader for masked pixels (rare in practice -- LERC files
-    # typically use chunky layout).
-    masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype)
-                   if compression == COMPRESSION_LERC else None)
-
-    # Track whether the array we end up with was already orientation-flipped
-    # by `read_to_array`. Any path that falls back to CPU decode picks up
-    # the orientation remap from PR #1521 + #1537 for free; the pure GPU
-    # paths still need the explicit remap added in #1540.
-    arr_was_cpu_decoded = False
-    # When a CPU fallback runs, ``read_to_array`` has already applied the
-    # MinIsWhite inversion and stashed the post-inversion sentinel on
-    # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one
-    # so the downstream nodata mask compares against the correct value
-    # (Copilot review of #1817).
-    _cpu_fallback_geo = None
-
-    # PlanarConfiguration=2 (separate bands): each band has its own list
-    # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
-    # tile-assembly kernel assumes a single chunky tile sequence with
-    # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2
-    # directly. Decode each band's tile slab as a single-band image, then
-    # stack into (H, W, samples). For planar=1 (chunky) the existing
-    # single-pass kernel is correct. Sparse-tile files always route to
-    # the CPU reader regardless of planar config.
-    if planar == 2 and samples > 1 and not has_sparse_tile:
-        tiles_across = math.ceil(width / tw)
-        tiles_down = math.ceil(height / th)
-        tiles_per_band = tiles_across * tiles_down
-        # validate_tile_layout already requires len(offsets) >= the grid;
-        # accept extra trailing entries (some writers emit padding) and
-        # only consume the first tiles_per_band * samples.
-        expected_min = tiles_per_band * samples
-        if len(offsets) < expected_min:
-            raise ValueError(
-                f"PlanarConfiguration=2 expects at least {expected_min} "
-                f"TileOffsets entries ({tiles_across} x {tiles_down} x "
-                f"{samples} bands), got {len(offsets)}"
-            )
-        # Lazy shared file read for the per-band stage-2 fallback. When
-        # every band's GDS path succeeds, _read_once is never called
-        # and we skip the read_all() entirely; when any band falls
-        # back, the first call materialises the bytes and subsequent
-        # bands reuse the same buffer (so N bands cost at most one
-        # read_all(), not N).
-        _shared_data_cache: list = []
-
-        def _read_once():
-            if not _shared_data_cache:
-                src2 = _FileSource(source)
-                try:
-                    _shared_data_cache.append(src2.read_all())
-                finally:
-                    src2.close()
-            return _shared_data_cache[0]
-
-        band_arrays = []
-        cpu_fallback_needed = False
-        for band_idx in range(samples):
-            b0 = band_idx * tiles_per_band
-            b1 = b0 + tiles_per_band
-            band_offsets = list(offsets[b0:b1])
-            band_byte_counts = list(byte_counts[b0:b1])
-            band_arr = _gpu_decode_single_band_tiles(
-                source, _read_once, band_offsets, band_byte_counts,
-                tw, th, width, height,
-                compression, predictor, file_dtype,
-                byte_order=header.byte_order,
-                gpu=gpu,
-            )
-            if band_arr is None:
-                # Auto-mode signal: stage-2 GPU decode failed for this
-                # band. There's no per-band CPU decode path, so fall
-                # back to a whole-image CPU read + GPU upload, matching
-                # the chunky path's auto-mode semantics.
-                cpu_fallback_needed = True
-                break
-            band_arrays.append(band_arr)
-        if cpu_fallback_needed:
-            # Drop read_to_array's geo_info for orientation transform
-            # handling (below operates on our pre-extracted geo_info so the
-            # 2/3/4 case is covered regardless of #1539's merge state), but
-            # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata
-            # mask below sees ``_mask_nodata``.
-            arr_cpu, _cpu_fallback_geo = _read_to_array(
-                source, overview_level=overview_level)
-            arr_gpu = cupy.asarray(arr_cpu)
-            arr_was_cpu_decoded = True
-        else:
-            arr_gpu = cupy.stack(band_arrays, axis=2)
-            if arr_gpu.shape != (height, width, samples):
-                raise RuntimeError(
-                    f"planar=2 GPU assembly produced shape "
-                    f"{arr_gpu.shape}, expected "
-                    f"({height}, {width}, {samples})"
-                )
-    elif has_sparse_tile:
-        arr_cpu, _cpu_fallback_geo = _read_to_array(
-            source, overview_level=overview_level)
-        arr_gpu = cupy.asarray(arr_cpu)
-        arr_was_cpu_decoded = True
-    else:
-        from ._gpu_decode import gpu_decode_tiles_from_file
-        arr_gpu = None
-
-        try:
-            arr_gpu = gpu_decode_tiles_from_file(
-                source, offsets, byte_counts,
-                tw, th, width, height,
-                compression, predictor, file_dtype, samples,
-                byte_order=header.byte_order,
-                masked_fill=masked_fill,
-            )
-        except Exception as e:
-            if gpu == 'strict' or _geotiff_strict_mode():
-                raise
-            warnings.warn(
-                f"read_geotiff_gpu: GPU decode failed "
-                f"({type(e).__name__}: {e}); falling back to CPU.",
-                RuntimeWarning,
-                stacklevel=2,
-            )
-            arr_gpu = None
-
-    if arr_gpu is None:
-        # Fallback: extract tiles via CPU mmap, then GPU decode
-        src2 = _FileSource(source)
-        data2 = src2.read_all()
-        try:
-            compressed_tiles = [
-                bytes(data2[offsets[i]:offsets[i] + byte_counts[i]])
-                for i in range(len(offsets))
-            ]
-        finally:
-            src2.close()
-
-    if arr_gpu is None:
-        try:
-            arr_gpu = gpu_decode_tiles(
-                compressed_tiles,
-                tw, th, width, height,
-                compression, predictor, file_dtype, samples,
-                byte_order=header.byte_order,
-                masked_fill=masked_fill,
-            )
-        except Exception as e:
-            if gpu == 'strict' or _geotiff_strict_mode():
-                raise
-            warnings.warn(
-                f"read_geotiff_gpu: GPU decode failed "
-                f"({type(e).__name__}: {e}); falling back to CPU.",
-                RuntimeWarning,
-                stacklevel=2,
-            )
-            arr_cpu, _cpu_fallback_geo = _read_to_array(
-                source, overview_level=overview_level)
-            arr_gpu = cupy.asarray(arr_cpu)
-            arr_was_cpu_decoded = True
-
-    # Multi-band tiled output must be (H, W, samples) regardless of planar
-    # config -- catch any shape regression in the kernels before we attach
-    # dims/coords below. Plain `raise` rather than `assert` so the check
-    # survives `python -O`.
-    if samples > 1:
-        if (arr_gpu.shape[:2] != (height, width)
-                or arr_gpu.shape[2] != samples):
-            raise RuntimeError(
-                f"GPU multi-band tile assembly produced shape "
-                f"{arr_gpu.shape}, expected "
-                f"({height}, {width}, {samples})"
-            )
-
-    # Apply the TIFF Orientation tag (274). The pure GPU paths land here
-    # with a raw stored-order buffer; the CPU-fallback paths land here
-    # with arr_gpu already remapped (read_to_array does the data flip)
-    # but with their pre-orientation geo_info (we discarded the one
-    # read_to_array returned because it does not handle 2/3/4 today).
-    # Skip the GPU array remap on CPU-decoded paths to avoid a double
-    # flip, but always apply the geo_info update so coords match.
-    if orientation != 1:
-        if not arr_was_cpu_decoded:
-            arr_gpu = _apply_orientation_gpu(arr_gpu, orientation)
-        geo_info = _apply_orientation_geo_info(
-            geo_info, orientation, file_h=height, file_w=width)
-
-    _mw_mask_nodata = None
-    if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded):
-        from ._reader import _miniswhite_inverted_nodata as _miw_inv_nd
-        gpu_dtype = np.dtype(str(arr_gpu.dtype))
-        # Compute the post-MinIsWhite sentinel BEFORE inverting the array,
-        # so the downstream ``_apply_nodata_mask_gpu`` call compares
-        # against the right value (#1809).
-        _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype)
-        if gpu_dtype.kind == 'u':
-            arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu
-        elif gpu_dtype.kind == 'f':
-            arr_gpu = -arr_gpu
-
-    # Apply nodata mask + record sentinel so the GPU read agrees with the
-    # CPU eager path (issue #1542). Without this, integer rasters keep the
-    # literal sentinel value and float rasters keep the sentinel rather
-    # than NaN -- a silent backend divergence. Apply before the optional
-    # dtype cast so the float promotion for masked integer rasters doesn't
-    # surprise a user-supplied dtype.
-    nodata = geo_info.nodata
-    if nodata is not None:
-        # When MinIsWhite was applied, the mask must use the inverted
-        # sentinel; otherwise the original sentinel. The pure GPU path
-        # records the inverted sentinel in ``_mw_mask_nodata`` above; the
-        # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and
-        # post-decode CPU fallback) get it from ``read_to_array`` via
-        # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817).
-        if _mw_mask_nodata is not None:
-            _gpu_mask_value = _mw_mask_nodata
-        elif _cpu_fallback_geo is not None:
-            _gpu_mask_value = getattr(
-                _cpu_fallback_geo, '_mask_nodata', nodata)
-        else:
-            _gpu_mask_value = nodata
-        arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value)
-
-    if dtype is not None:
-        target = np.dtype(dtype)
-        _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
-        arr_gpu = arr_gpu.astype(target)
-
-    # Build DataArray
-    if name is None:
-        import os
-        name = os.path.splitext(os.path.basename(source))[0]
-
-    attrs = {}
-    _populate_attrs_from_geo_info(attrs, geo_info, window=window)
-    if nodata is not None:
-        attrs['nodata'] = nodata
-
-    # Apply window/band slicing post-decode. Coords are derived from the
-    # sliced array so the (y, x) labels line up with the user's requested
-    # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask``
-    # contract: ``attrs['transform']`` always carries the full-source
-    # GeoTransform shifted to the window origin (via
-    # ``_populate_attrs_from_geo_info(..., window=window)``), while
-    # ``coords['y']`` / ``coords['x']`` cover only the windowed cells.
-    arr_gpu, coords = _gpu_apply_window_band(
-        arr_gpu, geo_info, window=window, band=band)
-
-    if arr_gpu.ndim == 3:
-        dims = ['y', 'x', 'band']
-        coords['band'] = np.arange(arr_gpu.shape[2])
-    else:
-        dims = ['y', 'x']
-
-    result = xr.DataArray(arr_gpu, dims=dims, coords=coords,
-                          name=name, attrs=attrs)
-
-    # ``chunks=`` is handled at function entry via
-    # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this
-    # eager path always returns a non-chunked CuPy-backed DataArray.
-
-    return result
-
-
-def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation):
-    """Return True iff a direct-to-GPU per-chunk decode is possible.
-
-    The disk->GPU per-chunk path requires:
-
-    - KvikIO present (so ``_try_kvikio_read_tiles`` can DMA tiles to VRAM).
-    - A local file path (no HTTP/fsspec source).
-    - A tiled layout (no strip-only file).
-    - PlanarConfiguration=1 (chunky); planar=2 would need per-band tile
-      grids and per-band crops.
-    - No sparse tiles, since the GPU decoders skip the bytes-zero-fill
-      handling the CPU reader does for them.
-    - Orientation == 1, since a non-default orientation needs the full
-      array on hand to apply the transform.
-    - PhotometricInterpretation != 0 (MinIsWhite needs an inversion
-      pass that lives in the eager path).
-    """
-    if not isinstance(source, str):
-        return False
-    if source.startswith(('http://', 'https://')):
-        return False
-    try:
-        from ._reader import _is_fsspec_uri
-        if _is_fsspec_uri(source):
-            return False
-    except Exception:
-        pass
-    try:
-        import importlib.util
-        if importlib.util.find_spec('kvikio') is None:
-            return False
-    except Exception:
-        return False
-    if not ifd.is_tiled:
-        return False
-    if has_sparse_tile:
-        return False
-    if ifd.planar_config != 1:
-        return False
-    if orientation != 1:
-        return False
-    if ifd.photometric == 0:
-        return False
-    return True
-
-
-def _decode_window_gpu_direct(file_path, all_offsets, all_byte_counts,
-                              tw, th, full_w, compression, predictor,
-                              file_dtype, samples, byte_order,
-                              r0, c0, r1, c1):
-    """Decode a window's tile subset disk->GPU.
-
-    Picks just the tiles overlapping ``(r0..r1, c0..c1)`` from the full
-    tile sequence, runs them through ``gpu_decode_tiles_from_file``
-    (which tries KvikIO GDS first, then a CPU mmap + ``gpu_decode_tiles``
-    fallback), and crops the assembled sub-image to the requested window.
-    Peak device memory is the sub-tile-grid bounding box, not the full
-    raster.
-
-    Called from inside a ``dask.delayed`` per-chunk task, so it runs
-    once per chunk and only pulls the tiles that chunk needs from disk.
-    """
-    from ._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file
-
-    tiles_across = (full_w + tw - 1) // tw
-
-    ty_start = r0 // th
-    ty_end = (r1 - 1) // th + 1
-    tx_start = c0 // tw
-    tx_end = (c1 - 1) // tw + 1
-
-    sub_tiles_across = tx_end - tx_start
-    sub_tiles_down = ty_end - ty_start
-    sub_h = sub_tiles_down * th
-    sub_w = sub_tiles_across * tw
-
-    indices = [ty * tiles_across + tx
-               for ty in range(ty_start, ty_end)
-               for tx in range(tx_start, tx_end)]
-    sub_offsets = [all_offsets[i] for i in indices]
-    sub_byte_counts = [all_byte_counts[i] for i in indices]
-
-    arr_gpu = gpu_decode_tiles_from_file(
-        file_path, sub_offsets, sub_byte_counts,
-        tw, th, sub_w, sub_h,
-        compression, predictor, file_dtype, samples,
-        byte_order=byte_order,
-    )
-
-    if arr_gpu is None:
-        # ``gpu_decode_tiles_from_file`` returns None when KvikIO is not
-        # usable on the host. Open the file via mmap, slice out just the
-        # bytes for these tiles, and run the GPU decoder on those.
-        from ._reader import _FileSource
-        src = _FileSource(file_path)
-        try:
-            data = src.read_all()
-            compressed_tiles = [
-                bytes(data[sub_offsets[i]:sub_offsets[i] + sub_byte_counts[i]])
-                for i in range(len(sub_offsets))
-            ]
-        finally:
-            src.close()
-        arr_gpu = gpu_decode_tiles(
-            compressed_tiles, tw, th, sub_w, sub_h,
-            compression, predictor, file_dtype, samples,
-            byte_order=byte_order,
-        )
-
-    crop_r0 = r0 - ty_start * th
-    crop_c0 = c0 - tx_start * tw
-    crop_r1 = crop_r0 + (r1 - r0)
-    crop_c1 = crop_c0 + (c1 - c0)
-    if samples > 1:
-        return arr_gpu[crop_r0:crop_r1, crop_c0:crop_c1, :]
-    return arr_gpu[crop_r0:crop_r1, crop_c0:crop_c1]
-
-
-def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
-                              window, band, name, max_pixels):
-    """Lazy Dask+CuPy backend for ``read_geotiff_gpu(chunks=...)``.
-
-    Two paths produce the same shape of dask graph:
-
-    1. **Direct disk->GPU** when KvikIO is available and the file is a
-       local, tiled, chunky GeoTIFF with no sparse tiles and a trivial
-       orientation. Each chunk task picks the tile subset for its
-       window, DMA's those tiles to the device via GDS, decodes on the
-       GPU, and crops. Peak GPU memory is one chunk and the file bytes
-       never traverse host memory.
-
-    2. **CPU decode + GPU upload** for everything else (HTTP / fsspec,
-       no KvikIO, planar=2, sparse, MinIsWhite, non-trivial orientation,
-       stripped layouts). Reuses ``read_geotiff_dask`` to build the
-       per-chunk windowed delayed graph and ``map_blocks(cupy.asarray)``
-       to upload each block. Peak GPU memory is still one chunk; the
-       cost is per-chunk CPU decode rather than GDS DMA.
-
-    Both paths are real out-of-core for device memory.
-    """
-    import cupy
-    import dask
-    import dask.array as da_mod
-
-    from ._reader import _FileSource, _coerce_path
-    from ._header import parse_header, parse_all_ifds, select_overview_ifd
-    from ._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
-    from ._geotags import extract_geo_info_with_overview_inheritance
-
-    src_path = _coerce_path(source)
-
-    # Try the disk->GPU path. Parse metadata once; if the file does not
-    # qualify, fall through to the CPU-decode path. Any unexpected
-    # exception during the qualification probe also falls through so we
-    # never lose the ability to return a result.
-    try:
-        if isinstance(src_path, str) and not src_path.startswith(
-                ('http://', 'https://')):
-            fs = _FileSource(src_path)
-            try:
-                raw = fs.read_all()
-            finally:
-                fs.close()
-            header = parse_header(raw)
-            ifds = parse_all_ifds(raw, header)
-            if not ifds:
-                raise ValueError("No IFDs found in TIFF file")
-            ifd = select_overview_ifd(ifds, overview_level)
-            geo_info = extract_geo_info_with_overview_inheritance(
-                ifd, ifds, raw, header.byte_order,
-            )
-            orientation = ifd.orientation
-            has_sparse_tile = (
-                ifd.tile_byte_counts is not None
-                and any(bc == 0 for bc in ifd.tile_byte_counts)
-            )
-            if _gds_chunk_path_available(
-                    src_path, ifd, has_sparse_tile, orientation):
-                return _read_geotiff_gpu_chunked_gds(
-                    src_path, ifd, geo_info, header,
-                    dtype=dtype, chunks=chunks, window=window, band=band,
-                    name=name, max_pixels=max_pixels,
-                )
-    except Exception:
-        # GDS qualification failed; fall back to the CPU path. The
-        # error would otherwise be unrelated to what the user asked
-        # for (the CPU path re-parses metadata anyway).
-        pass
-
-    cpu_da = read_geotiff_dask(
-        source, dtype=dtype, chunks=chunks,
-        overview_level=overview_level, window=window, band=band,
-        max_pixels=max_pixels, name=name,
-    )
-
-    cpu_dask_arr = cpu_da.data
-
-    def _upload(block):
-        return cupy.asarray(block)
-
-    gpu_dask_arr = cpu_dask_arr.map_blocks(
-        _upload,
-        dtype=cpu_dask_arr.dtype,
-        meta=cupy.empty((0,) * cpu_dask_arr.ndim, dtype=cpu_dask_arr.dtype),
-    )
-
-    return xr.DataArray(
-        gpu_dask_arr, dims=cpu_da.dims, coords=cpu_da.coords,
-        name=cpu_da.name, attrs=dict(cpu_da.attrs),
-    )
-
-
-def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
-                                  dtype, chunks, window, band, name,
-                                  max_pixels):
-    """Build a Dask+CuPy graph that decodes each chunk disk->GPU.
-
-    Caller must have verified that the source qualifies via
-    ``_gds_chunk_path_available``. Each chunk task pulls only the tile
-    subset overlapping its window via KvikIO GDS (or an mmap fallback
-    inside ``gpu_decode_tiles_from_file``) and crops on device.
-    """
-    import cupy
-    import dask
-    import dask.array as da_mod
-
-    from ._reader import _check_dimensions, MAX_PIXELS_DEFAULT
-    from ._header import validate_tile_layout
-    from ._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
-
-    if max_pixels is None:
-        max_pixels = MAX_PIXELS_DEFAULT
-
-    full_h = ifd.height
-    full_w = ifd.width
-    samples = ifd.samples_per_pixel
-    bps = resolve_bits_per_sample(ifd.bits_per_sample)
-    file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
-    tw = ifd.tile_width
-    th = ifd.tile_height
-    compression = ifd.compression
-    predictor = ifd.predictor
-    byte_order = header.byte_order
-    offsets = list(ifd.tile_offsets)
-    byte_counts = list(ifd.tile_byte_counts)
-
-    _check_dimensions(full_w, full_h, samples, max_pixels)
-    _check_dimensions(tw, th, samples, max_pixels)
-    validate_tile_layout(ifd)
-
-    # Window restricts the visible region; offsets are computed relative
-    # to the windowed origin so chunks line up with the user's request.
-    if window is not None:
-        w_r0, w_c0, w_r1, w_c1 = window
-        if (w_r0 < 0 or w_c0 < 0 or w_r1 > full_h or w_c1 > full_w
-                or w_r0 >= w_r1 or w_c0 >= w_c1):
-            raise ValueError(
-                f"window={window} is out of bounds for image "
-                f"{full_w}x{full_h}.")
-        out_h, out_w = w_r1 - w_r0, w_c1 - w_c0
-        win_r0, win_c0 = w_r0, w_c0
-    else:
-        out_h, out_w = full_h, full_w
-        win_r0, win_c0 = 0, 0
-
-    if isinstance(chunks, int):
-        ch_h = ch_w = chunks
-    else:
-        ch_h, ch_w = chunks
-    if ch_h <= 0 or ch_w <= 0:
-        raise ValueError(f"Invalid chunks: {chunks}")
-
-    # Validate band kwarg against the file's band count.
-    n_bands_out = samples if samples > 1 else 0
-    if band is not None:
-        if n_bands_out == 0:
-            if band != 0:
-                raise IndexError(
-                    f"band={band} requested but file is single-band.")
-        elif band < 0 or band >= n_bands_out:
-            raise IndexError(
-                f"band={band} out of range for {n_bands_out}-band file.")
-
-    # Wrap the big tile-offset/byte-count tuples in a single Delayed so
-    # every chunk task takes them as one graph input rather than burning
-    # them into every task's pickled closure.
-    meta_key = dask.delayed(
-        (offsets, byte_counts), pure=True,
-    )
-
-    nodata = geo_info.nodata
-
-    @dask.delayed
-    def _chunk_task(meta, r0, c0, r1, c1):
-        all_offsets, all_byte_counts = meta
-        arr = _decode_window_gpu_direct(
-            source, all_offsets, all_byte_counts,
-            tw, th, full_w, compression, predictor,
-            file_dtype, samples, byte_order,
-            r0, c0, r1, c1,
-        )
-        if nodata is not None:
-            arr = _apply_nodata_mask_gpu(arr, nodata)
-        if dtype is not None:
-            target = np.dtype(dtype)
-            _validate_dtype_cast(np.dtype(str(arr.dtype)), target)
-            arr = arr.astype(target)
-        if band is not None and arr.ndim == 3:
-            arr = arr[:, :, band]
-        return arr
-
-    # Determine declared dtype for the dask graph. Nodata masking
-    # promotes integer rasters to float64; mirror the CPU dask path.
-    declared_dtype = file_dtype
-    if nodata is not None and file_dtype.kind in ('u', 'i'):
-        if np.isfinite(nodata) and float(nodata).is_integer():
-            info = np.iinfo(file_dtype)
-            if info.min <= int(nodata) <= info.max:
-                declared_dtype = np.dtype('float64')
-    if dtype is not None:
-        declared_dtype = np.dtype(dtype)
-
-    out_has_band_axis = band is None and n_bands_out > 0
-
-    blocks_rows = []
-    for r0 in range(0, out_h, ch_h):
-        r1 = min(r0 + ch_h, out_h)
-        blocks_cols = []
-        for c0 in range(0, out_w, ch_w):
-            c1 = min(c0 + ch_w, out_w)
-            if out_has_band_axis:
-                block_shape = (r1 - r0, c1 - c0, n_bands_out)
-            else:
-                block_shape = (r1 - r0, c1 - c0)
-            # Convert chunk coords to file-space coords.
-            block = da_mod.from_delayed(
-                _chunk_task(meta_key,
-                            r0 + win_r0, c0 + win_c0,
-                            r1 + win_r0, c1 + win_c0),
-                shape=block_shape,
-                dtype=declared_dtype,
-                meta=cupy.empty((0,) * len(block_shape),
-                                dtype=declared_dtype),
-            )
-            blocks_cols.append(block)
-        blocks_rows.append(da_mod.concatenate(blocks_cols, axis=1))
-    dask_arr = da_mod.concatenate(blocks_rows, axis=0)
-
-    # Build coords/attrs that match read_geotiff_dask's output.
-    coords = _coords_from_geo_info(geo_info, out_h, out_w, window=window)
-    if out_has_band_axis:
-        dims = ['y', 'x', 'band']
-        coords['band'] = np.arange(n_bands_out)
-    else:
-        dims = ['y', 'x']
-
-    attrs = {}
-    _populate_attrs_from_geo_info(attrs, geo_info, window=window)
-    if nodata is not None:
-        attrs['nodata'] = nodata
-
-    if name is None:
-        import os
-        name = os.path.splitext(os.path.basename(source))[0]
-
-    return xr.DataArray(
-        dask_arr, dims=dims, coords=coords, name=name, attrs=attrs,
-    )
-
 
 def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray,
                       path: str | BinaryIO, *,
diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py
new file mode 100644
index 00000000..84ab09a3
--- /dev/null
+++ b/xrspatial/geotiff/_backends/gpu.py
@@ -0,0 +1,1047 @@
+"""GPU read backend: ``read_geotiff_gpu`` and its private helpers.
+
+Step 7 of issue #1813. With the leaf helpers in ``_backends/_gpu_helpers``
+already extracted (#1884), validators in ``_validation`` (#1882), attrs
+helpers in ``_attrs`` (#1883), and sentinels in ``_runtime`` (#1880),
+moving the entry-point body is a near-mechanical lift; the body stays
+unchanged except for adjusted relative imports and a lazy
+``read_geotiff_dask`` import that avoids a circular import with
+``__init__.py``.
+"""
+from __future__ import annotations
+
+import math
+import warnings
+
+import numpy as np
+import xarray as xr
+
+from .._attrs import _populate_attrs_from_geo_info
+from .._coords import (
+    coords_from_geo_info as _coords_from_geo_info,
+)
+from .._reader import read_to_array as _read_to_array
+from .._runtime import (
+    _GPU_DEPRECATED_SENTINEL,
+    _ON_GPU_FAILURE_SENTINEL,
+    _geotiff_strict_mode,
+)
+from .._validation import _validate_chunks_arg, _validate_dtype_cast
+from ._gpu_helpers import (
+    _apply_nodata_mask_gpu,
+    _apply_orientation_geo_info,
+    _apply_orientation_gpu,
+    _gpu_apply_window_band,
+    _gpu_decode_single_band_tiles,
+)
+
+
+def read_geotiff_gpu(source: str, *,
+                     dtype: str | np.dtype | None = None,
+                     overview_level: int | None = None,
+                     window: tuple | None = None,
+                     band: int | None = None,
+                     name: str | None = None,
+                     chunks: int | tuple | None = None,
+                     max_pixels: int | None = None,
+                     on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
+                     gpu: str = _GPU_DEPRECATED_SENTINEL,
+                     ) -> xr.DataArray:
+    """Read a GeoTIFF with GPU-accelerated decompression via Numba CUDA.
+
+    Decompresses all tiles in parallel on the GPU and returns a
+    CuPy-backed DataArray that stays on device memory. No CPU->GPU
+    transfer needed for downstream xrspatial GPU operations.
+
+    With ``chunks=``, returns a Dask+CuPy DataArray with real
+    out-of-core memory bounds: each chunk reads only the tiles for its
+    window (via the CPU dask path) and uploads the result to the
+    device, so peak GPU memory is one chunk rather than the whole
+    raster. The trade-off is per-chunk CPU decode rather than bulk GPU
+    decode; for rasters that fit on the device, ``chunks=None`` keeps
+    the full GPU-decode fast path.
+
+    Requires: cupy, numba with CUDA support.
+
+    Parameters
+    ----------
+    source : str
+        File path.
+    dtype : str, numpy.dtype, or None
+        Cast the result to this dtype after reading. None keeps the
+        file's native dtype. Float-to-int casts raise ValueError, mirroring
+        ``open_geotiff`` / ``read_geotiff_dask``.
+    overview_level : int or None
+        Overview level (0 = full resolution).
+    window : tuple or None
+        ``(row_start, col_start, row_stop, col_stop)`` for windowed
+        reading. None reads the full raster. The GPU pipeline currently
+        decodes all tiles and slices on device after assembly, so the
+        kwarg restores API parity with ``open_geotiff`` and
+        ``read_geotiff_dask`` but does not yet skip I/O for partial
+        windows. The returned coords, ``attrs['transform']``, and
+        shape match the eager numpy path.
+    band : int or None
+        Zero-based band index. None returns all bands (3D output for
+        multi-band files, 2D for single-band). Selecting a single band
+        yields a 2D DataArray.
+    chunks : int, tuple, or None
+        If set, return a Dask-chunked CuPy DataArray decoded one chunk
+        at a time. int for square chunks, (row, col) tuple for
+        rectangular. Each chunk task reads only the tiles overlapping
+        its window (CPU decode) and uploads the result to the device,
+        so peak GPU memory is bounded by chunk size. ``chunks=None``
+        (default) decodes the full raster on the GPU in one pass.
+    name : str or None
+        Name for the DataArray.
+    max_pixels : int or None
+        Maximum allowed pixel count (width * height * samples). None
+        uses the default (~1 billion).
+    on_gpu_failure : {'auto', 'strict'}, default 'auto'
+        Behaviour when any GPU decode stage raises an exception.
+
+        The GPU pipeline has two stages: first ``gpu_decode_tiles_from_file``
+        (GDS-style direct read), then ``gpu_decode_tiles`` over CPU-mmap
+        extracted tile bytes. Both stages still run on the GPU. The CPU
+        fallback (``read_to_array`` + ``cupy.asarray``) only fires after
+        both GPU stages have failed.
+
+        - ``'auto'``: each GPU-stage failure emits a ``RuntimeWarning``
+          reporting the original exception type and message, then falls
+          through to the next stage (CPU mmap re-decode for the first
+          failure, full CPU decode + GPU transfer for the second). This
+          preserves backward-compatible behaviour while making GPU
+          regressions visible.
+        - ``'strict'``: re-raise the original exception from either stage
+          so GPU bugs surface immediately. Useful in tests and CI for the
+          GPU fast path.
+
+        Stripped layouts and sparse-tile files route directly to the CPU
+        reader before either GPU decode stage runs, so the ``on_gpu_failure``
+        kwarg does not affect them. A failure inside the subsequent
+        ``cupy.asarray(...)`` upload propagates unchanged in both modes.
+    gpu : str, optional
+        Deprecated alias for ``on_gpu_failure``. Emits ``DeprecationWarning``
+        when used. Passing both ``gpu`` and ``on_gpu_failure`` raises
+        ``TypeError``. The old name shipped with values ``'auto'`` /
+        ``'strict'`` and was easy to confuse with the boolean ``gpu=``
+        kwarg on ``open_geotiff`` / ``to_geotiff`` / ``read_vrt``.
+
+    Returns
+    -------
+    xr.DataArray
+        CuPy-backed DataArray on GPU device.
+    """
+    new_passed = on_gpu_failure is not _ON_GPU_FAILURE_SENTINEL
+    old_passed = gpu is not _GPU_DEPRECATED_SENTINEL
+    if new_passed and old_passed:
+        # Both supplied is ambiguous regardless of which values were
+        # chosen (including the matching ``on_gpu_failure='auto',
+        # gpu='auto'`` pair). Refuse rather than silently picking one.
+        raise TypeError(
+            "read_geotiff_gpu: pass either 'on_gpu_failure' or the "
+            "deprecated 'gpu' alias, not both.")
+    if old_passed:
+        warnings.warn(
+            "read_geotiff_gpu(..., gpu=...) is deprecated; use "
+            "on_gpu_failure=... instead. The kwarg was renamed because "
+            "'gpu' on open_geotiff/to_geotiff/read_vrt is a bool that "
+            "selects the GPU backend, while here it selects the failure "
+            "policy when the GPU path raises.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        on_gpu_failure = gpu
+    elif not new_passed:
+        on_gpu_failure = 'auto'
+    gpu = on_gpu_failure
+    if gpu not in ('auto', 'strict'):
+        raise ValueError(
+            f"on_gpu_failure must be 'auto' or 'strict', got {gpu!r}")
+    # Reject non-positive chunk sizes up front so the GPU dask+cupy path
+    # surfaces the same error as ``read_geotiff_dask`` (#1776). Previously
+    # ``chunks=0`` raised ``ZeroDivisionError`` deep in cupy/dask, and
+    # ``chunks=-1`` was silently accepted (negative chunks fall out of
+    # the dask chunk grid as a no-op). ``chunks=None`` is the default
+    # (eager read), so allow it through here.
+    chunks = _validate_chunks_arg(chunks, allow_none=True)
+    try:
+        import cupy
+    except ImportError:
+        raise ImportError(
+            "cupy is required for GPU reads. "
+            "Install it with: pip install cupy-cuda12x")
+
+    # When ``chunks=`` is set, bound peak GPU memory to chunk size by
+    # building a Dask+CuPy graph that decodes one chunk at a time. The
+    # CPU dask path already lays out a window-per-chunk delayed graph
+    # (parses TIFF metadata once, decodes only the tiles overlapping
+    # each chunk window, handles HTTP/fsspec/local/sparse/planar=2/
+    # MinIsWhite/nodata/orientation). Reusing it and uploading each
+    # block to the device via ``map_blocks(cupy.asarray)`` gives real
+    # out-of-core behaviour for the read; the trade-off is per-chunk
+    # CPU decode rather than the eager path's bulk GPU decode. Users
+    # who want full GPU-side decode (and have device memory for the
+    # whole image) pass ``chunks=None``. See issue #1876.
+    if chunks is not None:
+        return _read_geotiff_gpu_chunked(
+            source, dtype=dtype, chunks=chunks,
+            overview_level=overview_level, window=window, band=band,
+            name=name, max_pixels=max_pixels,
+        )
+
+    from .._reader import (
+        _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
+        _resolve_masked_fill,
+    )
+    from .._compression import COMPRESSION_LERC
+    from .._header import (
+        parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout,
+    )
+    from .._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
+    from .._geotags import extract_geo_info_with_overview_inheritance
+    from .._gpu_decode import gpu_decode_tiles
+
+    source = _coerce_path(source)
+
+    if max_pixels is None:
+        max_pixels = MAX_PIXELS_DEFAULT
+
+    # Window basic shape check happens here; full bounds-vs-file validation
+    # runs after the IFD parse below so we can compare against the chosen
+    # overview level's actual height/width. ``band`` is similarly validated
+    # against ``ifd.samples_per_pixel`` after the header parse.
+    if window is not None:
+        if len(window) != 4:
+            raise ValueError(
+                f"window must be a 4-tuple (r0, c0, r1, c1), got {window!r}")
+        w_r0, w_c0, w_r1, w_c1 = window
+        if w_r0 >= w_r1 or w_c0 >= w_c1 or w_r0 < 0 or w_c0 < 0:
+            raise ValueError(
+                f"window={window} has non-positive size or negative origin.")
+
+    # Parse metadata on CPU (fast, <1ms)
+    src = _FileSource(source)
+    data = src.read_all()
+
+    try:
+        header = parse_header(data)
+        ifds = parse_all_ifds(data, header)
+
+        if len(ifds) == 0:
+            raise ValueError("No IFDs found in TIFF file")
+
+        # Skip mask IFDs (NewSubfileType bit 2)
+        ifd = select_overview_ifd(ifds, overview_level)
+
+        bps = resolve_bits_per_sample(ifd.bits_per_sample)
+        file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
+        # Inherit georef from the level-0 IFD when the overview itself
+        # has no geokeys (issue #1640); pass-through for level 0.
+        geo_info = extract_geo_info_with_overview_inheritance(
+            ifd, ifds, data, header.byte_order)
+        # Capture the Orientation tag (274) once so the post-decode flip
+        # below picks it up for both the stripped fallback and the tiled
+        # GPU pipelines. CPU read_to_array applies the array remap +
+        # transform update for stripped reads, so for that branch we
+        # only need to copy the post-flip geo_info back here.
+        orientation = ifd.orientation
+
+        # Orientation tag (274): values 2-8 mean the stored pixel order
+        # differs from display order. A windowed read against a non-default
+        # orientation has ambiguous semantics (does the window refer to
+        # file pixels or display pixels?), so the CPU reader
+        # ``_reader.read_to_array`` rejects ``window=`` for orientation != 1.
+        # Mirror that here so the GPU path agrees with the CPU path and
+        # ``read_geotiff_dask``. Use the same error wording so the failure
+        # message is identical across backends.
+        if orientation != 1 and window is not None:
+            raise ValueError(
+                f"Orientation tag (274) is {orientation}; windowed reads "
+                f"(window=...) and dask-chunked reads (chunks=...) are not "
+                f"supported for non-default orientation. Read the full "
+                f"array first, then slice."
+            )
+
+        # Validate band against the selected IFD's sample count.
+        # ``samples_per_pixel`` is at least 1 for any valid TIFF; we treat
+        # ``band=0`` as "first band" for single-band files too so the
+        # behaviour mirrors ``read_geotiff_dask``.
+        ifd_samples = ifd.samples_per_pixel
+        if band is not None:
+            # Reject ``bool`` and ``np.bool_`` up front;
+            # ``isinstance(True, int)`` is True in Python so
+            # ``True < ifd_samples`` evaluates without raising and silently
+            # reads band 1. ``np.bool_`` is not a subclass of ``bool`` so it
+            # needs its own check to match the VRT path's rejection.
+            # See #1786.
+            if isinstance(band, (bool, np.bool_)):
+                raise ValueError(
+                    f"band must be a non-negative int, got {band!r}")
+            if ifd_samples <= 1:
+                if band != 0:
+                    raise IndexError(
+                        f"band={band} requested on a single-band file.")
+            elif not 0 <= band < ifd_samples:
+                raise IndexError(
+                    f"band={band} out of range for {ifd_samples}-band file.")
+
+        # Validate window upper bounds against the selected IFD's extent.
+        if window is not None:
+            w_r0, w_c0, w_r1, w_c1 = window
+            ifd_h, ifd_w = ifd.height, ifd.width
+            if w_r1 > ifd_h or w_c1 > ifd_w:
+                raise ValueError(
+                    f"window={window} is outside the source extent "
+                    f"({ifd_h}x{ifd_w}).")
+
+        if not ifd.is_tiled:
+            # Fall back to CPU for stripped files. read_to_array remaps
+            # the array but only updates geo_info.transform for orientations
+            # 5-8 today (the 2/3/4 fix in #1539 is in a sibling PR). Discard
+            # its geo_info and apply our own transform update below so the
+            # result is correct regardless of merge order.
+            #
+            # Forward ``max_pixels``, ``window``, and ``band`` so the
+            # caller's safety cap is honoured, windowed reads avoid
+            # decoding the full image, and single-band selection on a
+            # multi-band source skips the unused channels. Without this,
+            # the stripped GPU path bypassed all three (issue #1732).
+            # Orientation != 1 + window is already rejected at line 2495,
+            # so ``window`` is None whenever ``geo_info`` will be remapped
+            # below.
+            src.close()
+            arr_cpu, _stripped_geo = _read_to_array(
+                source, overview_level=overview_level,
+                window=window, band=band, max_pixels=max_pixels)
+            arr_gpu = cupy.asarray(arr_cpu)
+            if orientation != 1:
+                geo_info = _apply_orientation_geo_info(
+                    geo_info, orientation,
+                    file_h=ifd.height, file_w=ifd.width)
+            if name is None:
+                import os
+                name = os.path.splitext(os.path.basename(source))[0]
+            attrs = {}
+            _populate_attrs_from_geo_info(attrs, geo_info, window=window)
+            # Apply nodata mask + record sentinel so the GPU read agrees
+            # with the CPU eager path. Without this, integer rasters keep
+            # the literal sentinel value and float rasters keep the
+            # sentinel rather than NaN -- a silent backend divergence.
+            # ``read_to_array`` stashes the post-MinIsWhite sentinel on
+            # ``_mask_nodata`` when applicable; fall back to the original
+            # sentinel otherwise (#1809).
+            nodata = geo_info.nodata
+            if nodata is not None:
+                attrs['nodata'] = nodata
+                mask_value = getattr(_stripped_geo, '_mask_nodata', nodata)
+                arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value)
+            if dtype is not None:
+                target = np.dtype(dtype)
+                _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
+                arr_gpu = arr_gpu.astype(target)
+            # ``read_to_array`` already applied window + band slicing, so
+            # ``arr_gpu`` is at output shape. Compute coords for that
+            # shape without re-slicing. Mirror the eager-numpy /
+            # ``read_geotiff_dask`` / ``_gpu_apply_window_band`` checks
+            # against ``has_georef``: a non-georef TIFF carries a
+            # default ``GeoTransform()`` placeholder (``t is None`` is
+            # never true here) so a transform-based coord path would
+            # emit synthetic ``[-0.5, -1.5, ...]`` floats instead of
+            # the integer pixel coords every other backend produces
+            # (#1753 / regression of #1710).
+            coords = _coords_from_geo_info(
+                geo_info, arr_gpu.shape[0], arr_gpu.shape[1], window=window,
+            )
+            # Multi-band stripped reads come back as (y, x, band); mirror
+            # the tiled branch so dims line up with ndim. Single-band stays
+            # 2-D ('y', 'x').
+            if arr_gpu.ndim == 3:
+                dims = ['y', 'x', 'band']
+                coords['band'] = np.arange(arr_gpu.shape[2])
+            else:
+                dims = ['y', 'x']
+            result = xr.DataArray(arr_gpu, dims=dims,
+                                  coords=coords, name=name, attrs=attrs)
+            # ``chunks`` was previously honoured only on the tiled path,
+            # so stripped TIFFs returned an unchunked DataArray even when
+            # the caller asked for a Dask+CuPy result. Mirror the tiled
+            # branch's chunking step so behaviour is consistent across
+            # layouts.
+            if chunks is not None:
+                if isinstance(chunks, int):
+                    chunk_dict = {'y': chunks, 'x': chunks}
+                else:
+                    chunk_dict = {'y': chunks[0], 'x': chunks[1]}
+                result = result.chunk(chunk_dict)
+            return result
+
+        offsets = ifd.tile_offsets
+        byte_counts = ifd.tile_byte_counts
+        compression = ifd.compression
+        predictor = ifd.predictor
+        samples = ifd.samples_per_pixel
+        planar = ifd.planar_config
+        tw = ifd.tile_width
+        th = ifd.tile_height
+        width = ifd.width
+        height = ifd.height
+
+        if tw <= 0 or th <= 0:
+            raise ValueError(
+                f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")
+
+        _check_dimensions(width, height, samples, max_pixels)
+        # A single tile's decoded bytes must also fit under the pixel budget.
+        _check_dimensions(tw, th, samples, max_pixels)
+
+        # Reject malformed TIFFs whose declared tile grid exceeds the
+        # supplied TileOffsets length. The GPU tile-assembly kernel would
+        # read OOB otherwise. See issue #1219.
+        validate_tile_layout(ifd)
+
+    finally:
+        src.close()
+
+    # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path.
+    # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline;
+    # the CPU reader fills them with nodata and copies onto the GPU.
+    has_sparse_tile = any(bc == 0 for bc in byte_counts)
+    # LERC tiles can carry a per-pixel valid mask that GDAL writes
+    # zero-filled in the data array.  Compute the nodata fill the same
+    # way the CPU reader does so the GPU decode path can restore it
+    # post-assembly (mirrors PR #1529 for the CPU path). Only the
+    # chunky (planar=1) GPU path threads masked_fill into its kernel
+    # call below; the planar=2 per-band branch falls back to the CPU
+    # reader for masked pixels (rare in practice -- LERC files
+    # typically use chunky layout).
+    masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype)
+                   if compression == COMPRESSION_LERC else None)
+
+    # Track whether the array we end up with was already orientation-flipped
+    # by `read_to_array`. Any path that falls back to CPU decode picks up
+    # the orientation remap from PR #1521 + #1537 for free; the pure GPU
+    # paths still need the explicit remap added in #1540.
+    arr_was_cpu_decoded = False
+    # When a CPU fallback runs, ``read_to_array`` has already applied the
+    # MinIsWhite inversion and stashed the post-inversion sentinel on
+    # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one
+    # so the downstream nodata mask compares against the correct value
+    # (Copilot review of #1817).
+    _cpu_fallback_geo = None
+
+    # PlanarConfiguration=2 (separate bands): each band has its own list
+    # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
+    # tile-assembly kernel assumes a single chunky tile sequence with
+    # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2
+    # directly. Decode each band's tile slab as a single-band image, then
+    # stack into (H, W, samples). For planar=1 (chunky) the existing
+    # single-pass kernel is correct. Sparse-tile files always route to
+    # the CPU reader regardless of planar config.
+    if planar == 2 and samples > 1 and not has_sparse_tile:
+        tiles_across = math.ceil(width / tw)
+        tiles_down = math.ceil(height / th)
+        tiles_per_band = tiles_across * tiles_down
+        # validate_tile_layout already requires len(offsets) >= the grid;
+        # accept extra trailing entries (some writers emit padding) and
+        # only consume the first tiles_per_band * samples.
+        expected_min = tiles_per_band * samples
+        if len(offsets) < expected_min:
+            raise ValueError(
+                f"PlanarConfiguration=2 expects at least {expected_min} "
+                f"TileOffsets entries ({tiles_across} x {tiles_down} x "
+                f"{samples} bands), got {len(offsets)}"
+            )
+        # Lazy shared file read for the per-band stage-2 fallback. When
+        # every band's GDS path succeeds, _read_once is never called
+        # and we skip the read_all() entirely; when any band falls
+        # back, the first call materialises the bytes and subsequent
+        # bands reuse the same buffer (so N bands cost at most one
+        # read_all(), not N).
+        _shared_data_cache: list = []
+
+        def _read_once():
+            if not _shared_data_cache:
+                src2 = _FileSource(source)
+                try:
+                    _shared_data_cache.append(src2.read_all())
+                finally:
+                    src2.close()
+            return _shared_data_cache[0]
+
+        band_arrays = []
+        cpu_fallback_needed = False
+        for band_idx in range(samples):
+            b0 = band_idx * tiles_per_band
+            b1 = b0 + tiles_per_band
+            band_offsets = list(offsets[b0:b1])
+            band_byte_counts = list(byte_counts[b0:b1])
+            band_arr = _gpu_decode_single_band_tiles(
+                source, _read_once, band_offsets, band_byte_counts,
+                tw, th, width, height,
+                compression, predictor, file_dtype,
+                byte_order=header.byte_order,
+                gpu=gpu,
+            )
+            if band_arr is None:
+                # Auto-mode signal: stage-2 GPU decode failed for this
+                # band. There's no per-band CPU decode path, so fall
+                # back to a whole-image CPU read + GPU upload, matching
+                # the chunky path's auto-mode semantics.
+                cpu_fallback_needed = True
+                break
+            band_arrays.append(band_arr)
+        if cpu_fallback_needed:
+            # Drop read_to_array's geo_info for orientation transform
+            # handling (below operates on our pre-extracted geo_info so the
+            # 2/3/4 case is covered regardless of #1539's merge state), but
+            # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata
+            # mask below sees ``_mask_nodata``.
+            arr_cpu, _cpu_fallback_geo = _read_to_array(
+                source, overview_level=overview_level)
+            arr_gpu = cupy.asarray(arr_cpu)
+            arr_was_cpu_decoded = True
+        else:
+            arr_gpu = cupy.stack(band_arrays, axis=2)
+            if arr_gpu.shape != (height, width, samples):
+                raise RuntimeError(
+                    f"planar=2 GPU assembly produced shape "
+                    f"{arr_gpu.shape}, expected "
+                    f"({height}, {width}, {samples})"
+                )
+    elif has_sparse_tile:
+        arr_cpu, _cpu_fallback_geo = _read_to_array(
+            source, overview_level=overview_level)
+        arr_gpu = cupy.asarray(arr_cpu)
+        arr_was_cpu_decoded = True
+    else:
+        from .._gpu_decode import gpu_decode_tiles_from_file
+        arr_gpu = None
+
+        try:
+            arr_gpu = gpu_decode_tiles_from_file(
+                source, offsets, byte_counts,
+                tw, th, width, height,
+                compression, predictor, file_dtype, samples,
+                byte_order=header.byte_order,
+                masked_fill=masked_fill,
+            )
+        except Exception as e:
+            if gpu == 'strict' or _geotiff_strict_mode():
+                raise
+            warnings.warn(
+                f"read_geotiff_gpu: GPU decode failed "
+                f"({type(e).__name__}: {e}); falling back to CPU.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            arr_gpu = None
+
+    if arr_gpu is None:
+        # Fallback: extract tiles via CPU mmap, then GPU decode
+        src2 = _FileSource(source)
+        data2 = src2.read_all()
+        try:
+            compressed_tiles = [
+                bytes(data2[offsets[i]:offsets[i] + byte_counts[i]])
+                for i in range(len(offsets))
+            ]
+        finally:
+            src2.close()
+
+    if arr_gpu is None:
+        try:
+            arr_gpu = gpu_decode_tiles(
+                compressed_tiles,
+                tw, th, width, height,
+                compression, predictor, file_dtype, samples,
+                byte_order=header.byte_order,
+                masked_fill=masked_fill,
+            )
+        except Exception as e:
+            if gpu == 'strict' or _geotiff_strict_mode():
+                raise
+            warnings.warn(
+                f"read_geotiff_gpu: GPU decode failed "
+                f"({type(e).__name__}: {e}); falling back to CPU.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            arr_cpu, _cpu_fallback_geo = _read_to_array(
+                source, overview_level=overview_level)
+            arr_gpu = cupy.asarray(arr_cpu)
+            arr_was_cpu_decoded = True
+
+    # Multi-band tiled output must be (H, W, samples) regardless of planar
+    # config -- catch any shape regression in the kernels before we attach
+    # dims/coords below. Plain `raise` rather than `assert` so the check
+    # survives `python -O`.
+    if samples > 1:
+        if (arr_gpu.shape[:2] != (height, width)
+                or arr_gpu.shape[2] != samples):
+            raise RuntimeError(
+                f"GPU multi-band tile assembly produced shape "
+                f"{arr_gpu.shape}, expected "
+                f"({height}, {width}, {samples})"
+            )
+
+    # Apply the TIFF Orientation tag (274). The pure GPU paths land here
+    # with a raw stored-order buffer; the CPU-fallback paths land here
+    # with arr_gpu already remapped (read_to_array does the data flip)
+    # but with their pre-orientation geo_info (we discarded the one
+    # read_to_array returned because it does not handle 2/3/4 today).
+    # Skip the GPU array remap on CPU-decoded paths to avoid a double
+    # flip, but always apply the geo_info update so coords match.
+    if orientation != 1:
+        if not arr_was_cpu_decoded:
+            arr_gpu = _apply_orientation_gpu(arr_gpu, orientation)
+        geo_info = _apply_orientation_geo_info(
+            geo_info, orientation, file_h=height, file_w=width)
+
+    _mw_mask_nodata = None
+    if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded):
+        from .._reader import _miniswhite_inverted_nodata as _miw_inv_nd
+        gpu_dtype = np.dtype(str(arr_gpu.dtype))
+        # Compute the post-MinIsWhite sentinel BEFORE inverting the array,
+        # so the downstream ``_apply_nodata_mask_gpu`` call compares
+        # against the right value (#1809).
+        _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype)
+        if gpu_dtype.kind == 'u':
+            arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu
+        elif gpu_dtype.kind == 'f':
+            arr_gpu = -arr_gpu
+
+    # Apply nodata mask + record sentinel so the GPU read agrees with the
+    # CPU eager path (issue #1542). Without this, integer rasters keep the
+    # literal sentinel value and float rasters keep the sentinel rather
+    # than NaN -- a silent backend divergence. Apply before the optional
+    # dtype cast so the float promotion for masked integer rasters doesn't
+    # surprise a user-supplied dtype.
+    nodata = geo_info.nodata
+    if nodata is not None:
+        # When MinIsWhite was applied, the mask must use the inverted
+        # sentinel; otherwise the original sentinel. The pure GPU path
+        # records the inverted sentinel in ``_mw_mask_nodata`` above; the
+        # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and
+        # post-decode CPU fallback) get it from ``read_to_array`` via
+        # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817).
+        if _mw_mask_nodata is not None:
+            _gpu_mask_value = _mw_mask_nodata
+        elif _cpu_fallback_geo is not None:
+            _gpu_mask_value = getattr(
+                _cpu_fallback_geo, '_mask_nodata', nodata)
+        else:
+            _gpu_mask_value = nodata
+        arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value)
+
+    if dtype is not None:
+        target = np.dtype(dtype)
+        _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
+        arr_gpu = arr_gpu.astype(target)
+
+    # Build DataArray
+    if name is None:
+        import os
+        name = os.path.splitext(os.path.basename(source))[0]
+
+    attrs = {}
+    _populate_attrs_from_geo_info(attrs, geo_info, window=window)
+    if nodata is not None:
+        attrs['nodata'] = nodata
+
+    # Apply window/band slicing post-decode. Coords are derived from the
+    # sliced array so the (y, x) labels line up with the user's requested
+    # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask``
+    # contract: ``attrs['transform']`` always carries the full-source
+    # GeoTransform shifted to the window origin (via
+    # ``_populate_attrs_from_geo_info(..., window=window)``), while
+    # ``coords['y']`` / ``coords['x']`` cover only the windowed cells.
+    arr_gpu, coords = _gpu_apply_window_band(
+        arr_gpu, geo_info, window=window, band=band)
+
+    if arr_gpu.ndim == 3:
+        dims = ['y', 'x', 'band']
+        coords['band'] = np.arange(arr_gpu.shape[2])
+    else:
+        dims = ['y', 'x']
+
+    result = xr.DataArray(arr_gpu, dims=dims, coords=coords,
+                          name=name, attrs=attrs)
+
+    # ``chunks=`` is handled at function entry via
+    # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this
+    # eager path always returns a non-chunked CuPy-backed DataArray.
+
+    return result
+
+
+def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation):
+    """Return True iff a direct-to-GPU per-chunk decode is possible.
+
+    The disk->GPU per-chunk path requires:
+
+    - KvikIO present (so ``_try_kvikio_read_tiles`` can DMA tiles to VRAM).
+    - A local file path (no HTTP/fsspec source).
+    - A tiled layout (no strip-only file).
+    - PlanarConfiguration=1 (chunky); planar=2 would need per-band tile
+      grids and per-band crops.
+    - No sparse tiles, since the GPU decoders skip the bytes-zero-fill
+      handling the CPU reader does for them.
+    - Orientation == 1, since a non-default orientation needs the full
+      array on hand to apply the transform.
+    - PhotometricInterpretation != 0 (MinIsWhite needs an inversion
+      pass that lives in the eager path).
+    """
+    if not isinstance(source, str):
+        return False
+    if source.startswith(('http://', 'https://')):
+        return False
+    try:
+        from .._reader import _is_fsspec_uri
+        if _is_fsspec_uri(source):
+            return False
+    except Exception:
+        pass
+    try:
+        import importlib.util
+        if importlib.util.find_spec('kvikio') is None:
+            return False
+    except Exception:
+        return False
+    if not ifd.is_tiled:
+        return False
+    if has_sparse_tile:
+        return False
+    if ifd.planar_config != 1:
+        return False
+    if orientation != 1:
+        return False
+    if ifd.photometric == 0:
+        return False
+    return True
+
+
+def _decode_window_gpu_direct(file_path, all_offsets, all_byte_counts,
+                              tw, th, full_w, compression, predictor,
+                              file_dtype, samples, byte_order,
+                              r0, c0, r1, c1):
+    """Decode a window's tile subset disk->GPU.
+
+    Picks just the tiles overlapping ``(r0..r1, c0..c1)`` from the full
+    tile sequence, runs them through ``gpu_decode_tiles_from_file``
+    (which tries KvikIO GDS first, then a CPU mmap + ``gpu_decode_tiles``
+    fallback), and crops the assembled sub-image to the requested window.
+    Peak device memory is the sub-tile-grid bounding box, not the full
+    raster.
+
+    Called from inside a ``dask.delayed`` per-chunk task, so it runs
+    once per chunk and only pulls the tiles that chunk needs from disk.
+    """
+    from .._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file
+
+    tiles_across = (full_w + tw - 1) // tw
+
+    ty_start = r0 // th
+    ty_end = (r1 - 1) // th + 1
+    tx_start = c0 // tw
+    tx_end = (c1 - 1) // tw + 1
+
+    sub_tiles_across = tx_end - tx_start
+    sub_tiles_down = ty_end - ty_start
+    sub_h = sub_tiles_down * th
+    sub_w = sub_tiles_across * tw
+
+    indices = [ty * tiles_across + tx
+               for ty in range(ty_start, ty_end)
+               for tx in range(tx_start, tx_end)]
+    sub_offsets = [all_offsets[i] for i in indices]
+    sub_byte_counts = [all_byte_counts[i] for i in indices]
+
+    arr_gpu = gpu_decode_tiles_from_file(
+        file_path, sub_offsets, sub_byte_counts,
+        tw, th, sub_w, sub_h,
+        compression, predictor, file_dtype, samples,
+        byte_order=byte_order,
+    )
+
+    if arr_gpu is None:
+        # ``gpu_decode_tiles_from_file`` returns None when KvikIO is not
+        # usable on the host. Open the file via mmap, slice out just the
+        # bytes for these tiles, and run the GPU decoder on those.
+        from .._reader import _FileSource
+        src = _FileSource(file_path)
+        try:
+            data = src.read_all()
+            compressed_tiles = [
+                bytes(data[sub_offsets[i]:sub_offsets[i] + sub_byte_counts[i]])
+                for i in range(len(sub_offsets))
+            ]
+        finally:
+            src.close()
+        arr_gpu = gpu_decode_tiles(
+            compressed_tiles, tw, th, sub_w, sub_h,
+            compression, predictor, file_dtype, samples,
+            byte_order=byte_order,
+        )
+
+    crop_r0 = r0 - ty_start * th
+    crop_c0 = c0 - tx_start * tw
+    crop_r1 = crop_r0 + (r1 - r0)
+    crop_c1 = crop_c0 + (c1 - c0)
+    if samples > 1:
+        return arr_gpu[crop_r0:crop_r1, crop_c0:crop_c1, :]
+    return arr_gpu[crop_r0:crop_r1, crop_c0:crop_c1]
+
+
+def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
+                              window, band, name, max_pixels):
+    """Lazy Dask+CuPy backend for ``read_geotiff_gpu(chunks=...)``.
+
+    Two paths produce the same shape of dask graph:
+
+    1. **Direct disk->GPU** when KvikIO is available and the file is a
+       local, tiled, chunky GeoTIFF with no sparse tiles and a trivial
+       orientation. Each chunk task picks the tile subset for its
+       window, DMA's those tiles to the device via GDS, decodes on the
+       GPU, and crops. Peak GPU memory is one chunk and the file bytes
+       never traverse host memory.
+
+    2. **CPU decode + GPU upload** for everything else (HTTP / fsspec,
+       no KvikIO, planar=2, sparse, MinIsWhite, non-trivial orientation,
+       stripped layouts). Reuses ``read_geotiff_dask`` to build the
+       per-chunk windowed delayed graph and ``map_blocks(cupy.asarray)``
+       to upload each block. Peak GPU memory is still one chunk; the
+       cost is per-chunk CPU decode rather than GDS DMA.
+
+    Both paths are real out-of-core for device memory.
+    """
+    import cupy
+
+    from .._reader import _FileSource, _coerce_path
+    from .._header import parse_header, parse_all_ifds, select_overview_ifd
+    from .._geotags import extract_geo_info_with_overview_inheritance
+
+    src_path = _coerce_path(source)
+
+    # Try the disk->GPU path. Parse metadata once; if the file does not
+    # qualify, fall through to the CPU-decode path. Any unexpected
+    # exception during the qualification probe also falls through so we
+    # never lose the ability to return a result.
+    try:
+        if isinstance(src_path, str) and not src_path.startswith(
+                ('http://', 'https://')):
+            fs = _FileSource(src_path)
+            try:
+                raw = fs.read_all()
+            finally:
+                fs.close()
+            header = parse_header(raw)
+            ifds = parse_all_ifds(raw, header)
+            if not ifds:
+                raise ValueError("No IFDs found in TIFF file")
+            ifd = select_overview_ifd(ifds, overview_level)
+            geo_info = extract_geo_info_with_overview_inheritance(
+                ifd, ifds, raw, header.byte_order,
+            )
+            orientation = ifd.orientation
+            has_sparse_tile = (
+                ifd.tile_byte_counts is not None
+                and any(bc == 0 for bc in ifd.tile_byte_counts)
+            )
+            if _gds_chunk_path_available(
+                    src_path, ifd, has_sparse_tile, orientation):
+                return _read_geotiff_gpu_chunked_gds(
+                    src_path, ifd, geo_info, header,
+                    dtype=dtype, chunks=chunks, window=window, band=band,
+                    name=name, max_pixels=max_pixels,
+                )
+    except Exception:
+        # GDS qualification failed; fall back to the CPU path. The
+        # error would otherwise be unrelated to what the user asked
+        # for (the CPU path re-parses metadata anyway).
+        pass
+
+    # Lazy import to avoid a circular dependency with ``__init__.py``,
+    # which re-exports ``read_geotiff_gpu`` from this module. By the
+    # time this line executes both modules are fully loaded.
+    from .. import read_geotiff_dask
+
+    cpu_da = read_geotiff_dask(
+        source, dtype=dtype, chunks=chunks,
+        overview_level=overview_level, window=window, band=band,
+        max_pixels=max_pixels, name=name,
+    )
+
+    cpu_dask_arr = cpu_da.data
+
+    def _upload(block):
+        return cupy.asarray(block)
+
+    gpu_dask_arr = cpu_dask_arr.map_blocks(
+        _upload,
+        dtype=cpu_dask_arr.dtype,
+        meta=cupy.empty((0,) * cpu_dask_arr.ndim, dtype=cpu_dask_arr.dtype),
+    )
+
+    return xr.DataArray(
+        gpu_dask_arr, dims=cpu_da.dims, coords=cpu_da.coords,
+        name=cpu_da.name, attrs=dict(cpu_da.attrs),
+    )
+
+
+def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
+                                  dtype, chunks, window, band, name,
+                                  max_pixels):
+    """Build a Dask+CuPy graph that decodes each chunk disk->GPU.
+
+    Caller must have verified that the source qualifies via
+    ``_gds_chunk_path_available``. Each chunk task pulls only the tile
+    subset overlapping its window via KvikIO GDS (or an mmap fallback
+    inside ``gpu_decode_tiles_from_file``) and crops on device.
+    """
+    import cupy
+    import dask
+    import dask.array as da_mod
+
+    from .._reader import _check_dimensions, MAX_PIXELS_DEFAULT
+    from .._header import validate_tile_layout
+    from .._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
+
+    if max_pixels is None:
+        max_pixels = MAX_PIXELS_DEFAULT
+
+    full_h = ifd.height
+    full_w = ifd.width
+    samples = ifd.samples_per_pixel
+    bps = resolve_bits_per_sample(ifd.bits_per_sample)
+    file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
+    tw = ifd.tile_width
+    th = ifd.tile_height
+    compression = ifd.compression
+    predictor = ifd.predictor
+    byte_order = header.byte_order
+    offsets = list(ifd.tile_offsets)
+    byte_counts = list(ifd.tile_byte_counts)
+
+    _check_dimensions(full_w, full_h, samples, max_pixels)
+    _check_dimensions(tw, th, samples, max_pixels)
+    validate_tile_layout(ifd)
+
+    # Window restricts the visible region; offsets are computed relative
+    # to the windowed origin so chunks line up with the user's request.
+    if window is not None:
+        w_r0, w_c0, w_r1, w_c1 = window
+        if (w_r0 < 0 or w_c0 < 0 or w_r1 > full_h or w_c1 > full_w
+                or w_r0 >= w_r1 or w_c0 >= w_c1):
+            raise ValueError(
+                f"window={window} is out of bounds for image "
+                f"{full_w}x{full_h}.")
+        out_h, out_w = w_r1 - w_r0, w_c1 - w_c0
+        win_r0, win_c0 = w_r0, w_c0
+    else:
+        out_h, out_w = full_h, full_w
+        win_r0, win_c0 = 0, 0
+
+    if isinstance(chunks, int):
+        ch_h = ch_w = chunks
+    else:
+        ch_h, ch_w = chunks
+    if ch_h <= 0 or ch_w <= 0:
+        raise ValueError(f"Invalid chunks: {chunks}")
+
+    # Validate band kwarg against the file's band count.
+    n_bands_out = samples if samples > 1 else 0
+    if band is not None:
+        if n_bands_out == 0:
+            if band != 0:
+                raise IndexError(
+                    f"band={band} requested but file is single-band.")
+        elif band < 0 or band >= n_bands_out:
+            raise IndexError(
+                f"band={band} out of range for {n_bands_out}-band file.")
+
+    # Wrap the big tile-offset/byte-count tuples in a single Delayed so
+    # every chunk task takes them as one graph input rather than burning
+    # them into every task's pickled closure.
+    meta_key = dask.delayed(
+        (offsets, byte_counts), pure=True,
+    )
+
+    nodata = geo_info.nodata
+
+    @dask.delayed
+    def _chunk_task(meta, r0, c0, r1, c1):
+        all_offsets, all_byte_counts = meta
+        arr = _decode_window_gpu_direct(
+            source, all_offsets, all_byte_counts,
+            tw, th, full_w, compression, predictor,
+            file_dtype, samples, byte_order,
+            r0, c0, r1, c1,
+        )
+        if nodata is not None:
+            arr = _apply_nodata_mask_gpu(arr, nodata)
+        if dtype is not None:
+            target = np.dtype(dtype)
+            _validate_dtype_cast(np.dtype(str(arr.dtype)), target)
+            arr = arr.astype(target)
+        if band is not None and arr.ndim == 3:
+            arr = arr[:, :, band]
+        return arr
+
+    # Determine declared dtype for the dask graph. Nodata masking
+    # promotes integer rasters to float64; mirror the CPU dask path.
+    declared_dtype = file_dtype
+    if nodata is not None and file_dtype.kind in ('u', 'i'):
+        if np.isfinite(nodata) and float(nodata).is_integer():
+            info = np.iinfo(file_dtype)
+            if info.min <= int(nodata) <= info.max:
+                declared_dtype = np.dtype('float64')
+    if dtype is not None:
+        declared_dtype = np.dtype(dtype)
+
+    out_has_band_axis = band is None and n_bands_out > 0
+
+    blocks_rows = []
+    for r0 in range(0, out_h, ch_h):
+        r1 = min(r0 + ch_h, out_h)
+        blocks_cols = []
+        for c0 in range(0, out_w, ch_w):
+            c1 = min(c0 + ch_w, out_w)
+            if out_has_band_axis:
+                block_shape = (r1 - r0, c1 - c0, n_bands_out)
+            else:
+                block_shape = (r1 - r0, c1 - c0)
+            # Convert chunk coords to file-space coords.
+            block = da_mod.from_delayed(
+                _chunk_task(meta_key,
+                            r0 + win_r0, c0 + win_c0,
+                            r1 + win_r0, c1 + win_c0),
+                shape=block_shape,
+                dtype=declared_dtype,
+                meta=cupy.empty((0,) * len(block_shape),
+                                dtype=declared_dtype),
+            )
+            blocks_cols.append(block)
+        blocks_rows.append(da_mod.concatenate(blocks_cols, axis=1))
+    dask_arr = da_mod.concatenate(blocks_rows, axis=0)
+
+    # Build coords/attrs that match read_geotiff_dask's output.
+    coords = _coords_from_geo_info(geo_info, out_h, out_w, window=window)
+    if out_has_band_axis:
+        dims = ['y', 'x', 'band']
+        coords['band'] = np.arange(n_bands_out)
+    else:
+        dims = ['y', 'x']
+
+    attrs = {}
+    _populate_attrs_from_geo_info(attrs, geo_info, window=window)
+    if nodata is not None:
+        attrs['nodata'] = nodata
+
+    if name is None:
+        import os
+        name = os.path.splitext(os.path.basename(source))[0]
+
+    return xr.DataArray(
+        dask_arr, dims=dims, coords=coords, name=name, attrs=attrs,
+    )
diff --git a/xrspatial/geotiff/tests/test_gpu_chunks_out_of_core_1876.py b/xrspatial/geotiff/tests/test_gpu_chunks_out_of_core_1876.py
index 506b2791..06dcd061 100644
--- a/xrspatial/geotiff/tests/test_gpu_chunks_out_of_core_1876.py
+++ b/xrspatial/geotiff/tests/test_gpu_chunks_out_of_core_1876.py
@@ -161,7 +161,7 @@ def test_read_geotiff_gpu_chunks_uses_gds_path_when_available(
     task must call the direct disk->GPU decoder rather than detouring
     through ``read_geotiff_dask``."""
     from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial import geotiff as gtmod
+    from xrspatial.geotiff._backends import gpu as gtmod
 
     direct_calls = {'n': 0}
     real_direct = gtmod._decode_window_gpu_direct
@@ -245,7 +245,7 @@ def test_read_geotiff_gpu_chunks_fallback_when_kvikio_absent(
     import importlib.util as _ilu
 
     from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial import geotiff as gtmod
+    from xrspatial.geotiff._backends import gpu as gtmod
 
     original_find_spec = _ilu.find_spec
 

From 9275b5b57a119af08d6327fd8b187a343c292e29 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Thu, 14 May 2026 21:00:39 -0700
Subject: [PATCH 2/2] geotiff: address Copilot review on _backends/gpu.py
 extraction (#1895)

Drop five GPU-helper re-imports from __init__.py that no longer have
internal callers after ``read_geotiff_gpu`` moved out:

- ``_apply_nodata_mask_gpu``
- ``_apply_orientation_geo_info``
- ``_apply_orientation_gpu``
- ``_gpu_apply_window_band``
- ``_gpu_decode_single_band_tiles``

``_is_gpu_data`` is still imported because ``to_geotiff`` and
``write_geotiff_gpu`` (which stay in __init__.py for now) reference
it to detect cupy-backed inputs.

The two parity-bug comments Copilot flagged on _backends/gpu.py
(boolean band validation missing on the GDS chunked path, LERC
masked_fill dropped on the GDS chunked path) are pre-existing in
the function bodies as they lived in __init__.py before this move.
They will be filed as their own follow-up issue against #1813 so
this PR stays pure code motion.
---
 xrspatial/geotiff/__init__.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 6c8d364a..7ad72b8f 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -68,14 +68,7 @@
     _populate_attrs_from_geo_info,
     _resolve_nodata_attr,
 )
-from ._backends._gpu_helpers import (
-    _apply_nodata_mask_gpu,
-    _apply_orientation_geo_info,
-    _apply_orientation_gpu,
-    _gpu_apply_window_band,
-    _gpu_decode_single_band_tiles,
-    _is_gpu_data,
-)
+from ._backends._gpu_helpers import _is_gpu_data
 from ._backends.gpu import read_geotiff_gpu
 from ._crs import _resolve_crs_to_wkt, _wkt_to_epsg
 from ._reader import read_to_array as _read_to_array