diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index 53192069..33dd28aa 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -992,7 +992,46 @@ def _pool_for_request(self, url: str, pinned_ip: str | None): return self._pool return self._get_pinned_pool(scheme, host, parsed.port, pinned_ip) + #: Absolute ceiling for a ``read_range(0, length)`` whose server + #: ignores ``Range`` and returns a 200 with the full object body. Up + #: to this size we tolerate the misbehaviour (the response is sliced + #: down to ``length``); beyond it we refuse to buffer. The value is + #: large enough to cover a typical sidecar/header prefetch served + #: from a Range-blind origin (kilobytes to a few MiB) but well below + #: the multi-gigabyte regime the OOM guard exists to prevent. + #: + #: This is the worst-case *transient* buffer per ``read_range`` + #: call. :meth:`read_ranges` fans out across up to 8 threads, so a + #: full pool against a Range-blind server can hold roughly + #: ``8 * _RANGE_IGNORED_FULL_OBJECT_CAP`` (~128 MiB) of body bytes + #: in flight at once. That is still bounded -- the pre-#2264 path + #: had no per-call bound at all -- but worth keeping in mind if a + #: future caller wants to scale ``max_workers`` higher. Issue #2264. + _RANGE_IGNORED_FULL_OBJECT_CAP = 16 * 1024 * 1024 + def read_range(self, start: int, length: int) -> bytes: + """Fetch ``[start, start + length)`` from the remote object. + + Sends ``Range: bytes={start}-{start + length - 1}`` and validates + the response on three axes before returning: + + - status is 200 or 206 (anything else raises ``OSError``); + - the advertised ``Content-Length`` (or the streamed body, if no + header) fits inside the per-call byte budget -- ``length`` + bytes for a 206 or non-zero start, or + :attr:`_RANGE_IGNORED_FULL_OBJECT_CAP` for the ``start=0`` + + 200-with-no-Content-Range fallback path; + - the ``Content-Range`` header (when present) actually starts at + ``start``. + + The body is streamed with ``preload_content=False`` so an + oversize response cannot land in ``resp.data`` before the cap is + applied; :meth:`_read_capped` aborts past the byte budget. + + Raises ``OSError`` when the server's body is past the budget, + the status code is wrong, or the ``Content-Range`` does not line + up with what was requested. Issue #2264. + """ # Match the ``b''``-for-non-positive-length convention used by # other source implementations (e.g. ``_BytesIOSource``). # Without this guard, ``Range: bytes=-`` goes on @@ -1003,14 +1042,79 @@ def read_range(self, start: int, length: int) -> bytes: return b'' end = start + length - 1 headers = {'Range': f'bytes={start}-{end}'} - resp = self._request(headers=headers) - return self._validate_range_response( - status=resp.status, - content_range=resp.headers.get('Content-Range'), - data=resp.data, - start=start, - length=length, - ) + # Stream the body so a server that ignores ``Range`` and returns + # the full object as 200 cannot drag arbitrarily many bytes into + # process memory before the cap is applied. Without + # ``preload_content=False`` urllib3 buffers the full response into + # ``resp.data`` *before* this method returns, so the slice in + # ``_validate_range_response`` only ever ran after the body was + # already resident. Mirrors the ``read_all`` streaming path that + # #2051 introduced. Issue #2264. + resp = self._request(headers=headers, preload_content=False) + try: + content_range = resp.headers.get('Content-Range') + content_length = resp.headers.get('Content-Length') + status = resp.status + # Choose the byte budget for ``_read_capped``. A 206 with + # Content-Range is bounded by ``length`` (the validator + # below also re-checks this against the advertised range). + # A 200 without Content-Range means the server ignored + # ``Range``; allow the full-object slack only when the + # caller asked from ``start=0`` (the only case where the + # full body offsets line up with what was requested). + if content_range is None and start == 0: + cap = self._RANGE_IGNORED_FULL_OBJECT_CAP + else: + cap = length + self._check_range_content_length(content_length, cap) + data = self._read_capped(resp, cap) + # Validate inside the try so we hold the response alive while + # status/headers are inspected. ``release_conn`` in the + # finally happens after the validator returns its bytes (or + # raises). + return self._validate_range_response( + status=status, + content_range=content_range, + data=data, + start=start, + length=length, + ) + finally: + try: + resp.release_conn() + except Exception: # noqa: BLE001 + pass + + @staticmethod + def _check_range_content_length(raw, cap: int) -> None: + """Reject a range response whose advertised ``Content-Length`` + exceeds the byte budget for the call. + + For a ``Range: bytes=S-E`` request, a well-behaved server returns + either a 206 with body length ``E - S + 1`` or a 200 with the + full object body. A server that ignores ``Range`` and returns a + multi-gigabyte 200 advertises that size up front; rejecting on + the header lets us bail before any body bytes are read. + + Missing or unparseable ``Content-Length`` returns silently -- + the streaming cap in :meth:`_read_capped` is the real defence + and will catch an over-sized body whether the header was honest, + dishonest, or absent. Issue #2264. + """ + if raw is None: + return + try: + declared = int(raw) + except (TypeError, ValueError): + return + if declared > cap: + raise OSError( + f"HTTP range response declares Content-Length=" + f"{declared:,} bytes, which exceeds the byte budget of " + f"{cap:,} bytes for this request. The server likely " + f"ignored the Range header and returned a multi-MiB " + f"body. Issue #2264." + ) @staticmethod def _validate_range_response(*, status, content_range, data, @@ -1066,8 +1170,12 @@ def _validate_range_response(*, status, content_range, data, raise OSError("HTTP 200 response body is empty.") # Server ignored Range and returned the full object as 200. # The implicit contract is "at most ``length`` bytes"; slice - # so a 16 KiB prefetch against a 2 GiB object doesn't drag - # the whole thing into memory. + # the buffered body down. ``read_range`` has already capped + # the buffer at :attr:`_RANGE_IGNORED_FULL_OBJECT_CAP` (16 + # MiB) via the streaming preflight, so ``data`` arriving here + # is bounded even when the server lied about the body size; + # the slice below just enforces the caller's contract. + # Issue #2264. if len(data) > length: return data[:length] return data @@ -1260,7 +1368,7 @@ def _read_capped(resp, max_bytes: int) -> bytes: f"HTTP response body exceeded the byte budget of " f"{max_bytes:,} bytes (received {received:,} bytes " f"before abort). The server likely ignored or lied " - f"about Content-Length. Issue #2051." + f"about Content-Length. Issues #2051, #2264." ) return b''.join(chunks) diff --git a/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py b/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py index 14aebe45..e143ed34 100644 --- a/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py +++ b/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py @@ -232,6 +232,19 @@ def __init__(self, status: int, location: str | None = None, self.status = status self.headers = {'Location': location} if location else {} self.data = data + self._body = data + if data: + self.headers['Content-Length'] = str(len(data)) + + def stream(self, amt=65536, decode_content=True): + if self._body: + yield self._body + + def release_conn(self): + pass + + def drain_conn(self): + pass class _MockPool: diff --git a/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py b/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py index 03c8ca80..a7a80299 100644 --- a/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py +++ b/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py @@ -115,12 +115,29 @@ def _resolver(host, port, *args, **kwargs): class _MockResp: - def __init__(self, status, data=b'', content_range=None): + def __init__(self, status, data=b'', content_range=None, + content_length=None): self.status = status self.data = data + self._body = data self.headers = {} if content_range is not None: self.headers['Content-Range'] = content_range + # ``read_range`` (post #2264) does a Content-Length preflight; let + # callers either pin it explicitly or default to len(data). + if content_length is None and data: + self.headers['Content-Length'] = str(len(data)) + elif content_length is not None: + self.headers['Content-Length'] = str(content_length) + + def stream(self, amt=65536, decode_content=True): + # Yield the body in a single chunk; ``_read_capped`` reads + # whatever ``stream()`` produces. + if self._body: + yield self._body + + def release_conn(self): + pass class _MockPool: @@ -150,6 +167,10 @@ def test_read_range_uses_urllib3_pool(monkeypatch): assert method == 'GET' assert kwargs.get('redirect') is False assert kwargs.get('headers', {}).get('Range') == 'bytes=0-99' + # Post #2264: the GET must request a streaming body so the cap is + # enforced on the wire rather than after urllib3 has already + # buffered ``resp.data``. + assert kwargs.get('preload_content') is False def test_read_all_uses_urllib3_pool(monkeypatch): diff --git a/xrspatial/geotiff/tests/test_http_range_validation_1735.py b/xrspatial/geotiff/tests/test_http_range_validation_1735.py index a74f47fa..34dbebe6 100644 --- a/xrspatial/geotiff/tests/test_http_range_validation_1735.py +++ b/xrspatial/geotiff/tests/test_http_range_validation_1735.py @@ -66,7 +66,14 @@ def do_GET(self): # noqa: N802 url, httpd, _ = _serve(_Handler) try: src = _HTTPSource(url) - with pytest.raises(OSError, match="Content-Range|range fetch"): + # Post #2264 ``read_range`` rejects on the Content-Length + # preflight before any body bytes are read; pre-#2264 the + # ``_validate_range_response`` step rejected on + # Content-Range/range-fetch grounds after the body was already + # buffered. Both wordings prove the request was refused. + with pytest.raises( + OSError, + match="Content-Range|Content-Length|range fetch"): src.read_range(8, 16) finally: _stop(httpd) @@ -184,19 +191,29 @@ def do_GET(self): # noqa: N802 _stop(httpd) -def test_range_ignored_200_with_full_body_is_sliced_to_length(): - """Server ignores ``Range`` for ``start=0`` and returns the full - object as a 200 with no ``Content-Range`` -> response is sliced to - the requested length. +def test_range_ignored_200_oversize_rejected_via_content_length( + monkeypatch): + """Server ignores ``Range`` for ``start=0`` and returns a 200 with + a ``Content-Length`` past the full-object slack cap. + + Before #2264, ``read_range`` buffered the entire body into + ``resp.data`` (urllib3 default ``preload_content=True``) and then + sliced down to ``length``. That defeated the OOM guard the slice + comment claimed: a 16 KiB prefetch against a 2 GiB body still + pulled 2 GiB into memory before the slice ran. The fix caps the + fallback at :attr:`_HTTPSource._RANGE_IGNORED_FULL_OBJECT_CAP` and + rejects on the ``Content-Length`` preflight before any body bytes + are read. - The implicit contract of ``read_range`` is "at most ``length`` - bytes". A 16 KiB header prefetch against a 2 GiB object must not - drag the whole thing into memory when the server misbehaves. + Drop the cap to a small value here so the test does not have to + stand up a multi-MiB payload to trigger rejection. """ + monkeypatch.setattr( + _HTTPSource, '_RANGE_IGNORED_FULL_OBJECT_CAP', 1024) class _Handler(_BaseHandler): - # Payload larger than the requested length so we can verify the - # slice happens. + # Payload larger than the patched cap so the preflight has + # something to reject. payload = b'\xab' * 4096 def do_GET(self): # noqa: N802 @@ -209,9 +226,156 @@ def do_GET(self): # noqa: N802 url, httpd, _ = _serve(_Handler) try: src = _HTTPSource(url) - # start=0 -> validator should slice rather than raise. + with pytest.raises(OSError, match="Content-Length|byte budget"): + src.read_range(0, 64) + finally: + _stop(httpd) + + +def test_range_ignored_200_full_object_sliced_within_cap(): + """Server ignores ``Range`` for ``start=0`` and returns the full + object as 200 with no ``Content-Range``. When the body fits + inside the full-object slack cap, ``read_range`` slices it down + to the requested length. + + This is the legitimate small-file fallback: the caller asked for + a 64-byte prefetch, the file is a few KiB, and the server doesn't + honour Range. Pre-#2264 the slice happened after the whole body + was already in ``resp.data``; post-#2264 the body is bounded by + the streaming cap on the wire. + """ + + class _Handler(_BaseHandler): + payload = b'\xcd' * 4096 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) out = src.read_range(0, 64) - assert len(out) == 64 + # Caller's "at most length bytes" contract holds even when the + # server returned a much larger body. assert out == _Handler.payload[:64] + assert len(out) == 64 + finally: + _stop(httpd) + + +def test_range_ignored_200_short_body_returned_as_is(): + """A 200 fallback whose body is smaller than the requested length + is returned unchanged (no slicing needed). + + This is the "tiny file served by a Range-blind origin" case: the + caller asked for a 64-byte header prefetch but the whole object + is only 40 bytes. + """ + + class _Handler(_BaseHandler): + payload = b'\xef' * 40 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + out = src.read_range(0, 64) + assert out == _Handler.payload + assert len(out) == 40 finally: _stop(httpd) + + +def test_range_ignored_200_no_content_length_is_streamed_and_capped( + monkeypatch): + """Server omits ``Content-Length`` and streams a body larger than + the full-object slack cap. ``_read_capped`` must abort once more + than the cap has arrived, so the body never gets fully buffered + into Python memory. + + This is the second half of the #2264 fix: the ``Content-Length`` + preflight catches honest oversize, the streaming cap (via chunked + transfer encoding here, since the server omits ``Content-Length``) + catches the case where the server volunteers no advertised size. + + Drop the full-object cap to a small value to keep the test fast. + """ + monkeypatch.setattr( + _HTTPSource, '_RANGE_IGNORED_FULL_OBJECT_CAP', 2048) + + class _Handler(_BaseHandler): + def do_GET(self): # noqa: N802 + # No Content-Length; use chunked transfer encoding. + self.send_response(200) + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + # Each chunk is 1024 bytes; send 8 of them (8192 total), + # past the 2048-byte patched cap. + chunk = b'\xee' * 1024 + chunk_header = f'{len(chunk):x}\r\n'.encode() + for _ in range(8): + self.wfile.write(chunk_header) + self.wfile.write(chunk) + self.wfile.write(b'\r\n') + self.wfile.write(b'0\r\n\r\n') + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + with pytest.raises(OSError, match="byte budget|exceeded"): + src.read_range(0, 64) + finally: + _stop(httpd) + + +def test_range_request_uses_streaming_response(monkeypatch): + """``read_range`` must request the body with ``preload_content= + False`` so urllib3 hands back a streaming response instead of + buffering ``resp.data`` up front. + + This pins the wire-level behaviour the OOM fix depends on. If a + future refactor flips the default back to ``preload_content= + True``, the streaming cap and the ``Content-Length`` preflight + both become advisory rather than enforcing. Issue #2264. + """ + + captured: dict = {} + + class _FakeResp: + def __init__(self, body): + self.status = 206 + self._body = body + self.headers = { + 'Content-Length': str(len(body)), + 'Content-Range': f'bytes 0-{len(body) - 1}/64', + } + + def stream(self, amt=65536, decode_content=True): + if self._body: + yield self._body + + def release_conn(self): + pass + + class _FakePool: + def request(self, method, url, headers=None, timeout=None, + redirect=None, preload_content=True): + captured['preload_content'] = preload_content + captured['headers'] = headers + return _FakeResp(b'\x01' * 16) + + src = _HTTPSource('http://127.0.0.1:65535/x.bin') + monkeypatch.setattr(src, '_pool', _FakePool()) + out = src.read_range(0, 16) + assert out == b'\x01' * 16 + # The hard contract: the GET went out asking for a streaming body. + assert captured['preload_content'] is False + assert captured['headers'] == {'Range': 'bytes=0-15'} diff --git a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py index 78d9e2ff..9336aa68 100644 --- a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py +++ b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py @@ -240,6 +240,23 @@ def __init__(self, status: int, location: str | None = None, self.status = status self.headers = {'Location': location} if location else {} self.data = data + self._body = data + # ``read_range`` (post #2264) does a Content-Length preflight and + # streams the body with ``_read_capped``. Pin Content-Length to + # the body size so the preflight passes; the streaming cap then + # reads ``data`` via ``stream()`` below. + if data: + self.headers['Content-Length'] = str(len(data)) + + def stream(self, amt=65536, decode_content=True): + if self._body: + yield self._body + + def release_conn(self): + pass + + def drain_conn(self): + pass class _MockPool: