Skip to content

Commit 8f4a099

Browse files
Decode bytes-like tool response text fields safely
Co-authored-by: Shri Sukhani <shrisukhani@users.noreply.github.com>
1 parent 679a3e2 commit 8f4a099

2 files changed

Lines changed: 119 additions & 18 deletions

File tree

hyperbrowser/tools/__init__.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,26 @@ def _serialize_extract_tool_data(data: Any) -> str:
163163
) from exc
164164

165165

166+
def _normalize_optional_text_field_value(
167+
field_value: Any,
168+
*,
169+
error_message: str,
170+
) -> str:
171+
if field_value is None:
172+
return ""
173+
if isinstance(field_value, str):
174+
return field_value
175+
if isinstance(field_value, (bytes, bytearray, memoryview)):
176+
try:
177+
return memoryview(field_value).tobytes().decode("utf-8")
178+
except (TypeError, ValueError, UnicodeDecodeError) as exc:
179+
raise HyperbrowserError(
180+
error_message,
181+
original_error=exc,
182+
) from exc
183+
raise HyperbrowserError(error_message)
184+
185+
166186
def _read_tool_response_data(response: Any, *, tool_name: str) -> Any:
167187
if isinstance(response, MappingABC):
168188
try:
@@ -232,11 +252,12 @@ def _read_optional_tool_response_field(
232252
) from exc
233253
if field_value is None:
234254
return ""
235-
if not isinstance(field_value, str):
236-
raise HyperbrowserError(
237-
f"{tool_name} response field '{field_name}' must be a string"
238-
)
239-
return field_value
255+
return _normalize_optional_text_field_value(
256+
field_value,
257+
error_message=(
258+
f"{tool_name} response field '{field_name}' must be a UTF-8 string"
259+
),
260+
)
240261

241262

242263
def _read_crawl_page_field(page: Any, *, field_name: str, page_index: int) -> Any:
@@ -290,20 +311,25 @@ def _render_crawl_markdown_output(response_data: Any) -> str:
290311
)
291312
if page_markdown is None:
292313
continue
293-
if not isinstance(page_markdown, str):
294-
raise HyperbrowserError(
295-
f"crawl tool page field 'markdown' must be a string at index {index}"
296-
)
314+
page_markdown = _normalize_optional_text_field_value(
315+
page_markdown,
316+
error_message=(
317+
"crawl tool page field 'markdown' must be a UTF-8 string "
318+
f"at index {index}"
319+
),
320+
)
297321
if not page_markdown:
298322
continue
299323
page_url = _read_crawl_page_field(page, field_name="url", page_index=index)
300324
if page_url is None:
301325
page_url_display = "<unknown url>"
302-
elif not isinstance(page_url, str):
303-
raise HyperbrowserError(
304-
f"crawl tool page field 'url' must be a string at index {index}"
305-
)
306326
else:
327+
page_url = _normalize_optional_text_field_value(
328+
page_url,
329+
error_message=(
330+
f"crawl tool page field 'url' must be a UTF-8 string at index {index}"
331+
),
332+
)
307333
page_url_display = page_url if page_url.strip() else "<unknown url>"
308334
markdown_sections.append(
309335
f"\n{'-' * 50}\nUrl: {page_url_display}\nMarkdown:\n{page_markdown}\n"

tests/test_tools_response_handling.py

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -269,11 +269,33 @@ def test_scrape_tool_rejects_non_string_markdown_field():
269269

270270
with pytest.raises(
271271
HyperbrowserError,
272-
match="scrape tool response field 'markdown' must be a string",
272+
match="scrape tool response field 'markdown' must be a UTF-8 string",
273273
):
274274
WebsiteScrapeTool.runnable(client, {"url": "https://example.com"})
275275

276276

277+
def test_scrape_tool_decodes_utf8_bytes_markdown_field():
    """The scrape tool returns bytes markdown as the decoded text."""
    scrape_response = _Response(data=SimpleNamespace(markdown=b"hello"))
    scrape_client = _SyncScrapeClient(scrape_response)

    rendered = WebsiteScrapeTool.runnable(
        scrape_client,
        {"url": "https://example.com"},
    )

    assert rendered == "hello"
283+
284+
285+
def test_scrape_tool_wraps_invalid_utf8_markdown_bytes():
    """Undecodable markdown bytes raise HyperbrowserError carrying the cause."""
    undecodable_payload = SimpleNamespace(markdown=b"\xff\xfe\xfd")
    scrape_client = _SyncScrapeClient(_Response(data=undecodable_payload))
    expected_message = "scrape tool response field 'markdown' must be a UTF-8 string"

    with pytest.raises(HyperbrowserError, match=expected_message) as raised:
        WebsiteScrapeTool.runnable(scrape_client, {"url": "https://example.com"})

    # The decode failure must be preserved on the wrapped error.
    assert raised.value.original_error is not None
297+
298+
277299
def test_scrape_tool_supports_mapping_response_data():
278300
client = _SyncScrapeClient(_Response(data={"markdown": "from mapping"}))
279301

@@ -318,11 +340,19 @@ def test_screenshot_tool_rejects_non_string_screenshot_field():
318340

319341
with pytest.raises(
320342
HyperbrowserError,
321-
match="screenshot tool response field 'screenshot' must be a string",
343+
match="screenshot tool response field 'screenshot' must be a UTF-8 string",
322344
):
323345
WebsiteScreenshotTool.runnable(client, {"url": "https://example.com"})
324346

325347

348+
def test_screenshot_tool_decodes_utf8_bytes_field():
    """The screenshot tool returns a bytes screenshot field as decoded text."""
    screenshot_response = _Response(data=SimpleNamespace(screenshot=b"image-data"))
    screenshot_client = _SyncScrapeClient(screenshot_response)

    rendered = WebsiteScreenshotTool.runnable(
        screenshot_client,
        {"url": "https://example.com"},
    )

    assert rendered == "image-data"
354+
355+
326356
def test_crawl_tool_rejects_non_list_response_data():
327357
client = _SyncCrawlClient(_Response(data={"invalid": "payload"}))
328358

@@ -407,11 +437,36 @@ def test_crawl_tool_rejects_non_string_page_urls():
407437

408438
with pytest.raises(
409439
HyperbrowserError,
410-
match="crawl tool page field 'url' must be a string at index 0",
440+
match="crawl tool page field 'url' must be a UTF-8 string at index 0",
411441
):
412442
WebsiteCrawlTool.runnable(client, {"url": "https://example.com"})
413443

414444

445+
def test_crawl_tool_decodes_utf8_bytes_page_fields():
    """Bytes ``url`` and ``markdown`` page fields are rendered as decoded text."""
    client = _SyncCrawlClient(
        _Response(data=[SimpleNamespace(url=b"https://example.com", markdown=b"page")])
    )

    output = WebsiteCrawlTool.runnable(client, {"url": "https://example.com"})

    assert "Url: https://example.com" in output
    # A bare `"page" in output` would also pass if the bytes leaked through as
    # their repr (b'page'), so pin the decoded text to the Markdown section and
    # rule the repr out explicitly.
    assert "Markdown:\npage" in output
    assert "b'page'" not in output
454+
455+
456+
def test_crawl_tool_wraps_invalid_utf8_page_field_bytes():
    """An undecodable page ``url`` raises HyperbrowserError carrying the cause."""
    bad_page = SimpleNamespace(url=b"\xff", markdown="body")
    crawl_client = _SyncCrawlClient(_Response(data=[bad_page]))
    expected_message = (
        "crawl tool page field 'url' must be a UTF-8 string at index 0"
    )

    with pytest.raises(HyperbrowserError, match=expected_message) as raised:
        WebsiteCrawlTool.runnable(crawl_client, {"url": "https://example.com"})

    # The decode failure must be preserved on the wrapped error.
    assert raised.value.original_error is not None
468+
469+
415470
def test_crawl_tool_uses_unknown_url_for_blank_page_urls():
416471
client = _SyncCrawlClient(
417472
_Response(data=[SimpleNamespace(url=" ", markdown="page body")])
@@ -443,11 +498,19 @@ def test_browser_use_tool_rejects_non_string_final_result():
443498

444499
with pytest.raises(
445500
HyperbrowserError,
446-
match="browser-use tool response field 'final_result' must be a string",
501+
match="browser-use tool response field 'final_result' must be a UTF-8 string",
447502
):
448503
BrowserUseTool.runnable(client, {"task": "search docs"})
449504

450505

506+
def test_browser_use_tool_decodes_utf8_bytes_final_result():
    """The browser-use tool returns a bytes ``final_result`` as decoded text."""
    task_response = _Response(data=SimpleNamespace(final_result=b"done"))
    browser_use_client = _SyncBrowserUseClient(task_response)

    rendered = BrowserUseTool.runnable(browser_use_client, {"task": "search docs"})

    assert rendered == "done"
512+
513+
451514
def test_browser_use_tool_supports_mapping_response_data():
452515
client = _SyncBrowserUseClient(_Response(data={"final_result": "mapping output"}))
453516

@@ -520,7 +583,7 @@ async def run() -> None:
520583
)
521584
with pytest.raises(
522585
HyperbrowserError,
523-
match="browser-use tool response field 'final_result' must be a string",
586+
match="browser-use tool response field 'final_result' must be a UTF-8 string",
524587
):
525588
await BrowserUseTool.async_runnable(client, {"task": "search docs"})
526589

@@ -539,6 +602,18 @@ async def run() -> None:
539602
asyncio.run(run())
540603

541604

605+
def test_async_scrape_tool_decodes_utf8_bytes_markdown_field():
    """The async scrape path returns bytes markdown as the decoded text."""

    async def _exercise() -> None:
        scrape_response = _Response(data=SimpleNamespace(markdown=b"async"))
        async_client = _AsyncScrapeClient(scrape_response)
        rendered = await WebsiteScrapeTool.async_runnable(
            async_client,
            {"url": "https://example.com"},
        )
        assert rendered == "async"

    asyncio.run(_exercise())
615+
616+
542617
def test_async_crawl_tool_supports_mapping_page_items():
543618
async def run() -> None:
544619
client = _AsyncCrawlClient(

0 commit comments

Comments
 (0)