Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion xrspatial/geotiff/_cog_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
# so monkeypatches against the ``_reader`` namespace continue to
# intercept the source. PR-J / #2258.
_HTTPSource,
_max_coalesced_range_bytes_from_env,
_max_tile_bytes_from_env,
)
from ._validation import _validate_predictor_sample_format
Expand Down Expand Up @@ -901,6 +902,13 @@ def _fetch_decode_cog_http_tiles(
# tolerates small interleaved metadata between tiles without dragging
# in unrelated overview data. Set XRSPATIAL_COG_COALESCE_GAP=-1 to
# disable merging (one GET per tile, the legacy behaviour).
#
# The merged-range size cap (issue #2266) is resolved here too so
# the call below is self-documenting: a reader can see at the call
# site that both ``gap_threshold`` and ``max_coalesced_range_bytes``
# are governed by env vars. Without the explicit lookup the cap
# would still apply -- ``coalesce_ranges`` resolves a ``None`` cap
# against the same env var -- but the asymmetry would hide that.
try:
workers = max(1, int(_os_module.environ.get('XRSPATIAL_COG_HTTP_WORKERS', '8')))
except ValueError:
Expand All @@ -911,8 +919,12 @@ def _fetch_decode_cog_http_tiles(
str(COALESCE_GAP_THRESHOLD_DEFAULT)))
except ValueError:
gap = COALESCE_GAP_THRESHOLD_DEFAULT
max_coalesced = _max_coalesced_range_bytes_from_env()
tile_bytes_list = source.read_ranges_coalesced(
fetch_ranges, max_workers=workers, gap_threshold=gap)
fetch_ranges,
max_workers=workers,
gap_threshold=gap,
max_coalesced_range_bytes=max_coalesced)

# Pass 3: decode each tile and place it (clipped to the window).
#
Expand Down
2 changes: 2 additions & 0 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
# Public module-level constants.
COALESCE_GAP_THRESHOLD_DEFAULT,
MAX_CLOUD_BYTES_DEFAULT,
MAX_COALESCED_RANGE_BYTES_DEFAULT,
MAX_TILE_BYTES_DEFAULT,
# Private module-level constants and sentinels.
_CLOUD_SCHEMES,
Expand Down Expand Up @@ -116,6 +117,7 @@
_is_file_like,
_is_fsspec_uri,
_make_pinned_pool,
_max_coalesced_range_bytes_from_env,
_max_tile_bytes_from_env,
_mmap_cache_size_from_env,
_open_source,
Expand Down
95 changes: 85 additions & 10 deletions xrspatial/geotiff/_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,10 +546,40 @@ def _validate_http_url(url: str) -> str | None:
#: O(num_tiles) bytes plus at most one threshold of slack between tiles.
COALESCE_GAP_THRESHOLD_DEFAULT = 1 << 20 # 1 MB

#: Default upper bound (bytes) on any single coalesced range. The gap
#: threshold alone does not bound the *total* over-fetch: a tile table
#: with N entries whose offsets are spaced just under ``gap_threshold``
#: apart will chain into one merged range of size ~N * gap_threshold,
#: even when each individual tile is tiny and passes the per-tile cap.
#: This cap seals the current merged range and starts a new one once
#: extending it would exceed the limit. Override via the
#: ``XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES`` environment variable.
#: Issue #2266.
MAX_COALESCED_RANGE_BYTES_DEFAULT = MAX_TILE_BYTES_DEFAULT # 256 MiB


def _max_coalesced_range_bytes_from_env() -> int:
"""Read the coalesced-range cap from the environment, or use the default.

Non-integer, empty, zero, or negative values all fall back to
``MAX_COALESCED_RANGE_BYTES_DEFAULT``. Mirrors the policy used by
:func:`_max_tile_bytes_from_env` so callers can not accidentally set
an unreachable 1-byte cap.
"""
raw = _os_module.environ.get('XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES')
if raw is None:
return MAX_COALESCED_RANGE_BYTES_DEFAULT
try:
val = int(raw)
except (TypeError, ValueError):
return MAX_COALESCED_RANGE_BYTES_DEFAULT
return val if val > 0 else MAX_COALESCED_RANGE_BYTES_DEFAULT


def coalesce_ranges(
ranges: list[tuple[int, int]],
gap_threshold: int = COALESCE_GAP_THRESHOLD_DEFAULT,
max_coalesced_range_bytes: int | None = None,
) -> tuple[list[tuple[int, int]], list[tuple[int, int, int]]]:
"""Merge nearby ``(offset, length)`` ranges into fewer larger ones.

Expand All @@ -562,6 +592,16 @@ def coalesce_ranges(
Maximum gap, in bytes, between two adjacent ranges before they
are merged. A gap of zero means perfectly back-to-back; larger
gaps trade some over-fetch for fewer round-trips.
max_coalesced_range_bytes : int or None, optional
Upper bound on any single merged range. When extending the
current merged range would push its length above this cap, the
current range is sealed and a new one is started instead. This
bounds the *total* over-fetch even when many small ranges are
spaced just under ``gap_threshold`` apart. ``None`` (the
default) reads the cap from
``XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES`` (falling back to
:data:`MAX_COALESCED_RANGE_BYTES_DEFAULT`, 256 MiB). A
non-positive value disables the cap. Issue #2266.

Returns
-------
Expand All @@ -575,11 +615,19 @@ def coalesce_ranges(
Notes
-----
Empty input returns ``([], [])``. Negative gap thresholds disable
merging entirely (every input becomes its own merged range).
merging entirely (every input becomes its own merged range). When a
single input range already exceeds ``max_coalesced_range_bytes`` it
is still emitted intact -- the per-tile cap in
:func:`_max_tile_bytes_from_env` is the right place to reject
oversized individual tiles; this cap only governs how greedily
*separate* tiles are stitched together.
"""
if not ranges:
return [], []

if max_coalesced_range_bytes is None:
max_coalesced_range_bytes = _max_coalesced_range_bytes_from_env()

# Tag each input with its original index so we can rebuild mapping.
indexed = sorted(
((off, length, i) for i, (off, length) in enumerate(ranges)),
Expand All @@ -596,12 +644,21 @@ def coalesce_ranges(

for off, length, orig_idx in indexed[1:]:
gap = off - cur_end
if gap_threshold >= 0 and gap <= gap_threshold:
# Extend current merged range. Gaps may be negative if a
# later-listed range overlaps an earlier one; clamp so the
# merged length covers both.
new_end = max(cur_end, off + length)
cur_length = new_end - cur_start
# Gaps may be negative if a later-listed range overlaps an
# earlier one; clamp ``new_end`` so the merged length covers
# both. ``candidate_length`` is the length the merged range
# would have if we extended it to include this input. We use
# it both to decide whether the merge is allowed under the
# size cap and (when it is) to update ``cur_length``.
new_end = max(cur_end, off + length)
candidate_length = new_end - cur_start
size_ok = (
max_coalesced_range_bytes <= 0
or candidate_length <= max_coalesced_range_bytes
)
if gap_threshold >= 0 and gap <= gap_threshold and size_ok:
# Extend current merged range.
cur_length = candidate_length
cur_end = new_end
members.append((orig_idx, off, length))
else:
Expand Down Expand Up @@ -1151,6 +1208,7 @@ def read_ranges_coalesced(
ranges: list[tuple[int, int]],
max_workers: int = 8,
gap_threshold: int = COALESCE_GAP_THRESHOLD_DEFAULT,
max_coalesced_range_bytes: int | None = None,
) -> list[bytes]:
"""Fetch *ranges* using merged GETs where adjacent ranges allow it.

Expand All @@ -1162,10 +1220,20 @@ def read_ranges_coalesced(

Setting *gap_threshold* to a negative number disables merging
and falls back to one GET per input range.

*max_coalesced_range_bytes* caps the size of any single merged
GET. ``None`` (the default) reads the cap from
``XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES`` and otherwise uses
:data:`MAX_COALESCED_RANGE_BYTES_DEFAULT`. See
:func:`coalesce_ranges` for details. Issue #2266.
"""
if not ranges:
return []
merged, mapping = coalesce_ranges(ranges, gap_threshold=gap_threshold)
merged, mapping = coalesce_ranges(
ranges,
gap_threshold=gap_threshold,
max_coalesced_range_bytes=max_coalesced_range_bytes,
)
merged_bytes = self.read_ranges(merged, max_workers=max_workers)
return split_coalesced_bytes(merged_bytes, mapping)

Expand Down Expand Up @@ -1423,16 +1491,23 @@ def read_ranges_coalesced(
ranges: list[tuple[int, int]],
max_workers: int = 8,
gap_threshold: int = COALESCE_GAP_THRESHOLD_DEFAULT,
max_coalesced_range_bytes: int | None = None,
) -> list[bytes]:
"""Fetch *ranges* using merged GETs where adjacent ranges allow it.

Mirrors :meth:`_HTTPSource.read_ranges_coalesced` so the tiled
COG decode path can coalesce neighbouring tiles when reading
from object storage.
from object storage. ``max_coalesced_range_bytes`` caps the
size of any single merged GET; see :func:`coalesce_ranges`.
Issue #2266.
"""
if not ranges:
return []
merged, mapping = coalesce_ranges(ranges, gap_threshold=gap_threshold)
merged, mapping = coalesce_ranges(
ranges,
gap_threshold=gap_threshold,
max_coalesced_range_bytes=max_coalesced_range_bytes,
)
merged_bytes = self.read_ranges(merged, max_workers=max_workers)
return split_coalesced_bytes(merged_bytes, mapping)

Expand Down
144 changes: 144 additions & 0 deletions xrspatial/geotiff/tests/test_http_cog_coalesce.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,122 @@ def test_coalesce_split_recovers_per_tile_bytes():
assert tile == payload[off:off + length]


# ---------------------------------------------------------------------------
# Issue #2266: coalesced-range size cap. Without this cap a tile table
# with many small valid byte counts and sub-MiB gaps would chain into
# one merged range whose length is roughly num_tiles * gap_threshold,
# turning a safe per-tile fetch into a multi-GiB over-fetch.
# ---------------------------------------------------------------------------

def test_coalesce_caps_merged_range_size_2266():
# 8 tiny ranges spaced 1 MiB apart. Every gap is within the default
# 1 MiB threshold so without the size cap they would all merge into
# one ~7 MiB range. With a 4 MiB cap the coalescer must split. The
# next test (``test_coalesce_cap_round_trips_bytes_2266``) covers
# byte-level recovery after the split.
one_mib = 1 << 20
ranges = [(i * one_mib, 1024) for i in range(8)]
merged, mapping = coalesce_ranges(
ranges, max_coalesced_range_bytes=4 * one_mib)
# No merged range exceeds the cap.
for _start, length in merged:
assert length <= 4 * one_mib, (
f'merged range of {length} bytes exceeds 4 MiB cap')
# Splitting still happened: more than one merged range.
assert len(merged) > 1
# Every input is still represented in the mapping.
assert len(mapping) == len(ranges)


def test_coalesce_cap_round_trips_bytes_2266():
# When the cap forces a split, split_coalesced_bytes must still
# recover every original byte range correctly.
one_mib = 1 << 20
payload_len = 8 * one_mib + 1024
# Use a deterministic payload we can slice and compare against.
payload = bytes((i * 17) & 0xFF for i in range(payload_len))
ranges = [(i * one_mib, 1024) for i in range(8)]

merged, mapping = coalesce_ranges(
ranges, max_coalesced_range_bytes=4 * one_mib)
merged_bytes = [payload[s:s + le] for (s, le) in merged]
out = split_coalesced_bytes(merged_bytes, mapping)

for (off, length), tile in zip(ranges, out):
assert tile == payload[off:off + length]


def test_coalesce_default_cap_bounds_adversarial_input_2266():
# The motivating scenario from issue #2266: 4096 tiles, each 1 KB,
# with offsets spaced 1 MiB apart. Without the cap this collapses
# into one ~4 GiB merged range. With the default cap nothing
# exceeds MAX_COALESCED_RANGE_BYTES_DEFAULT.
from xrspatial.geotiff._sources import (
MAX_COALESCED_RANGE_BYTES_DEFAULT,
)

one_mib = 1 << 20
ranges = [(i * one_mib, 1024) for i in range(4096)]
merged, _ = coalesce_ranges(ranges)
for _start, length in merged:
assert length <= MAX_COALESCED_RANGE_BYTES_DEFAULT, (
f'merged range {length} bytes exceeds default cap '
f'{MAX_COALESCED_RANGE_BYTES_DEFAULT} bytes')


def test_coalesce_cap_zero_disables_size_check_2266():
# A non-positive cap means "no size limit" -- the gap threshold
# alone governs merging. Useful as an escape hatch for callers
# that have their own bookkeeping.
one_mib = 1 << 20
ranges = [(i * one_mib, 1024) for i in range(8)]
merged, _ = coalesce_ranges(
ranges, max_coalesced_range_bytes=0)
# All eight merge into one ~7 MiB + 1 KB range.
assert len(merged) == 1
_, length = merged[0]
assert length == 7 * one_mib + 1024


def test_coalesce_cap_does_not_split_legitimate_back_to_back_2266():
# The cap must not punish well-behaved COGs whose tiles really are
# back-to-back. A real COG with 64 tiles of 64 KB each (total 4 MiB)
# should still collapse into a single GET under the default cap.
tile_bytes = 64 * 1024
n_tiles = 64
ranges = [(i * tile_bytes, tile_bytes) for i in range(n_tiles)]
merged, _ = coalesce_ranges(ranges)
assert len(merged) == 1
assert merged[0] == (0, n_tiles * tile_bytes)


def test_coalesce_cap_respects_env_override_2266(monkeypatch):
# When max_coalesced_range_bytes is None (the default), the helper
# reads XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES from the environment.
one_mib = 1 << 20
ranges = [(i * one_mib, 1024) for i in range(8)]
# Force a 2 MiB cap via env. The 8 ranges spaced 1 MiB apart must
# split into at least 4 merged ranges (2 MiB each + slack).
monkeypatch.setenv(
'XRSPATIAL_COG_MAX_COALESCED_RANGE_BYTES', str(2 * one_mib))
merged, _ = coalesce_ranges(ranges)
for _start, length in merged:
assert length <= 2 * one_mib
assert len(merged) >= 4


def test_coalesce_cap_preserves_oversized_single_input_2266():
# If a single input range already exceeds the cap, the function
# still emits it intact. Rejecting oversized individual tiles is
# the job of the per-tile cap, not the coalescer.
big = 10 * (1 << 20) # 10 MiB
ranges = [(0, big)]
merged, mapping = coalesce_ranges(
ranges, max_coalesced_range_bytes=1 << 20) # 1 MiB cap
assert merged == [(0, big)]
assert mapping == [(0, 0, big)]


# ---------------------------------------------------------------------------
# Mocked HTTP source for perf and call-count assertions
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -148,6 +264,34 @@ def read_all(self) -> bytes:
return self._buf


def test_http_source_read_ranges_coalesced_respects_cap_2266():
"""The HTTP wrapper must propagate the size cap to coalesce_ranges.

Builds a 16 MiB in-memory buffer, then asks the source to fetch
eight 1 KB ranges spaced 1 MiB apart. Without the cap the wrapper
would issue a single ~7 MiB merged GET; with a 4 MiB cap it issues
at least two smaller GETs.
"""
one_mib = 1 << 20
buf = bytes((i * 13) & 0xFF for i in range(16 * one_mib))
src = _MockHTTPSource(buf)
ranges = [(i * one_mib, 1024) for i in range(8)]

out = src.read_ranges_coalesced(
ranges, max_workers=2,
max_coalesced_range_bytes=4 * one_mib)
# Bytes must match the original per-range slices.
for (off, length), tile in zip(ranges, out):
assert tile == buf[off:off + length]
# The actual GETs the mock saw must all respect the cap.
assert src.calls, 'no GETs were issued'
for _start, length in src.calls:
assert length <= 4 * one_mib, (
f'merged GET of {length} bytes exceeds 4 MiB cap')
# And the cap must have caused at least one split.
assert len(src.calls) >= 2


@pytest.fixture
def small_cog_bytes(tmp_path):
"""Build a small tiled COG and return its raw bytes."""
Expand Down
14 changes: 12 additions & 2 deletions xrspatial/geotiff/tests/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,14 +612,24 @@ def read_all(self) -> bytes:
def read_ranges(self, ranges, max_workers=8):
return [self.read_range(s, le) for s, le in ranges]

def read_ranges_coalesced(self, ranges, max_workers=8, gap_threshold=None):
def read_ranges_coalesced(
self,
ranges,
max_workers=8,
gap_threshold=None,
max_coalesced_range_bytes=None,
):
from xrspatial.geotiff._reader import (
coalesce_ranges, split_coalesced_bytes,
COALESCE_GAP_THRESHOLD_DEFAULT,
)
if gap_threshold is None:
gap_threshold = COALESCE_GAP_THRESHOLD_DEFAULT
merged, mapping = coalesce_ranges(ranges, gap_threshold=gap_threshold)
merged, mapping = coalesce_ranges(
ranges,
gap_threshold=gap_threshold,
max_coalesced_range_bytes=max_coalesced_range_bytes,
)
merged_bytes = self.read_ranges(merged, max_workers=max_workers)
return split_coalesced_bytes(merged_bytes, mapping)

Expand Down
Loading
Loading