From 3c736c01f4e359dc7c79b3f0660acc23a58f59f8 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Fri, 5 Jun 2026 06:16:50 +0800
Subject: [PATCH] fix(fetch): add python-only readability fallback

Signed-off-by: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
---
 src/fetch/README.md                        |   6 +-
 src/fetch/src/mcp_server_fetch/__init__.py |  14 +-
 src/fetch/src/mcp_server_fetch/server.py   | 102 ++++++++++-----
 src/fetch/tests/test_server.py             | 144 ++++++++++++++++-----
 4 files changed, 200 insertions(+), 66 deletions(-)
diff --git a/src/fetch/README.md b/src/fetch/README.md
index 2c3e048927..ce105d5f09 100644
--- a/src/fetch/README.md
+++ b/src/fetch/README.md
@@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume
 
 ## Installation
 
-Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
+Optionally, install Node.js; this causes the fetch server to use a different, more robust HTML simplifier. If `node` is not available on `PATH`, the server falls back to the Python-only simplifier.
 
 ### Using uv (recommended)
 
@@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th
 
 The server can be configured to use a proxy by using the `--proxy-url` argument.
 
+### Customization - HTML simplifier
+
+By default, the server uses the Node.js readability simplifier when `node` is available on `PATH`, and otherwise falls back to readabilipy's Python-only simplifier. If a host has Node.js installed but the Node.js simplifier is slow or misconfigured, add the `--no-readability-js` argument to force the Python-only simplifier.
+
 ## Windows Configuration
 
 If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:
diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py
index 09744ce319..a0451ecc05 100644
--- a/src/fetch/src/mcp_server_fetch/__init__.py
+++ b/src/fetch/src/mcp_server_fetch/__init__.py
@@ -16,9 +16,21 @@ def main():
         help="Ignore robots.txt restrictions",
     )
     parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
+    parser.add_argument(
+        "--no-readability-js",
+        action="store_true",
+        help="Use readabilipy's Python-only HTML simplifier instead of the optional Node.js readability path",
+    )
 
     args = parser.parse_args()
-    asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
+    asyncio.run(
+        serve(
+            args.user_agent,
+            args.ignore_robots_txt,
+            args.proxy_url,
+            use_readability_js=False if args.no_readability_js else None,
+        )
+    )
 
 
 if __name__ == "__main__":
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..cc745f903a 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -1,3 +1,4 @@
+import shutil
 from typing import Annotated, Tuple
 from urllib.parse import urlparse, urlunparse
 
@@ -24,7 +25,7 @@
 DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
 
 
-def extract_content_from_html(html: str) -> str:
+def extract_content_from_html(html: str, use_readability_js: bool | None = None) -> str:
     """Extract and convert HTML content to Markdown format.
 
     Args:
@@ -33,8 +34,11 @@ def extract_content_from_html(html: str) -> str:
     Returns:
         Simplified markdown version of the content
     """
+    if use_readability_js is None:
+        use_readability_js = shutil.which("node") is not None
+
     ret = readabilipy.simple_json.simple_json_from_html_string(
-        html, use_readability=True
+        html, use_readability=use_readability_js
     )
     if not ret["content"]:
         return "<error>Page failed to be simplified from HTML</error>"
@@ -63,7 +67,9 @@ def get_robots_txt_url(url: str) -> str:
     return robots_url
 
 
-async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: str | None = None) -> None:
+async def check_may_autonomously_fetch_url(
+    url: str, user_agent: str, proxy_url: str | None = None
+) -> None:
     """
     Check if the URL can be fetched by the user agent according to the robots.txt file.
     Raises a McpError if not.
@@ -80,15 +86,19 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
                 headers={"User-Agent": user_agent},
             )
         except HTTPError:
-            raise McpError(ErrorData(
-                code=INTERNAL_ERROR,
-                message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
-            ))
+            raise McpError(
+                ErrorData(
+                    code=INTERNAL_ERROR,
+                    message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
+                )
+            )
         if response.status_code in (401, 403):
-            raise McpError(ErrorData(
-                code=INTERNAL_ERROR,
-                message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
-            ))
+            raise McpError(
+                ErrorData(
+                    code=INTERNAL_ERROR,
+                    message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
+                )
+            )
         elif 400 <= response.status_code < 500:
             return
         robot_txt = response.text
@@ -97,19 +107,25 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
     )
     robot_parser = Protego.parse(processed_robot_txt)
     if not robot_parser.can_fetch(str(url), user_agent):
-        raise McpError(ErrorData(
-            code=INTERNAL_ERROR,
-            message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
-            f"<useragent>{user_agent}</useragent>\n"
-            f"<url>{url}</url>"
-            f"<robots>\n{robot_txt}\n</robots>\n"
-            f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
-            f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
-        ))
+        raise McpError(
+            ErrorData(
+                code=INTERNAL_ERROR,
+                message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
+                f"<useragent>{user_agent}</useragent>\n"
+                f"<url>{url}</url>"
+                f"<robots>\n{robot_txt}\n</robots>\n"
+                f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
+                f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
+            )
+        )
 
 
 async def fetch_url(
-    url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
+    url: str,
+    user_agent: str,
+    force_raw: bool = False,
+    proxy_url: str | None = None,
+    use_readability_js: bool | None = None,
 ) -> Tuple[str, str]:
     """
     Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
@@ -125,12 +141,16 @@ async def fetch_url(
                 timeout=30,
             )
         except HTTPError as e:
-            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}"))
+            raise McpError(
+                ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}")
+            )
         if response.status_code >= 400:
-            raise McpError(ErrorData(
-                code=INTERNAL_ERROR,
-                message=f"Failed to fetch {url} - status code {response.status_code}",
-            ))
+            raise McpError(
+                ErrorData(
+                    code=INTERNAL_ERROR,
+                    message=f"Failed to fetch {url} - status code {response.status_code}",
+                )
+            )
 
         page_raw = response.text
 
@@ -140,7 +160,9 @@ async def fetch_url(
     )
 
     if is_page_html and not force_raw:
-        return extract_content_from_html(page_raw), ""
+        return extract_content_from_html(
+            page_raw, use_readability_js=use_readability_js
+        ), ""
 
     return (
         page_raw,
@@ -182,6 +204,7 @@ async def serve(
     custom_user_agent: str | None = None,
     ignore_robots_txt: bool = False,
     proxy_url: str | None = None,
+    use_readability_js: bool | None = None,
 ) -> None:
     """Run the fetch MCP server.
 
@@ -232,22 +255,32 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
             raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required"))
 
         if not ignore_robots_txt:
-            await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
+            await check_may_autonomously_fetch_url(
+                url, user_agent_autonomous, proxy_url
+            )
 
         content, prefix = await fetch_url(
-            url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
+            url,
+            user_agent_autonomous,
+            force_raw=args.raw,
+            proxy_url=proxy_url,
+            use_readability_js=use_readability_js,
         )
         original_length = len(content)
         if args.start_index >= original_length:
             content = "<error>No more content available.</error>"
         else:
-            truncated_content = content[args.start_index : args.start_index + args.max_length]
+            truncated_content = content[
+                args.start_index : args.start_index + args.max_length
+            ]
             if not truncated_content:
                 content = "<error>No more content available.</error>"
             else:
                 content = truncated_content
                 actual_content_length = len(truncated_content)
-                remaining_content = original_length - (args.start_index + actual_content_length)
+                remaining_content = original_length - (
+                    args.start_index + actual_content_length
+                )
                 # Only add the prompt to continue fetching if there is still remaining content
                 if actual_content_length == args.max_length and remaining_content > 0:
                     next_start = args.start_index + actual_content_length
@@ -262,7 +295,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
         url = arguments["url"]
 
         try:
-            content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
+            content, prefix = await fetch_url(
+                url,
+                user_agent_manual,
+                proxy_url=proxy_url,
+                use_readability_js=use_readability_js,
+            )
             # TODO: after SDK bug is addressed, don't catch the exception
         except McpError as e:
             return GetPromptResult(
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..237637a2aa 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -87,6 +87,35 @@ def test_empty_content_returns_error(self):
         result = extract_content_from_html(html)
         assert "<error>" in result
 
+    def test_can_disable_readability_js(self):
+        """Test that callers can force the Python-only extraction path."""
+        html = "<html><body><article><p>Hello World</p></article></body></html>"
+
+        with patch(
+            "readabilipy.simple_json.simple_json_from_html_string",
+            return_value={"content": "<p>Hello World</p>"},
+        ) as mock_extract:
+            result = extract_content_from_html(html, use_readability_js=False)
+
+        assert "Hello World" in result
+        mock_extract.assert_called_once_with(html, use_readability=False)
+
+    def test_auto_disables_readability_js_without_node(self):
+        """Test that Python-only environments do not try the Node.js path."""
+        html = "<html><body><article><p>Hello World</p></article></body></html>"
+
+        with (
+            patch("shutil.which", return_value=None),
+            patch(
+                "readabilipy.simple_json.simple_json_from_html_string",
+                return_value={"content": "<p>Hello World</p>"},
+            ) as mock_extract,
+        ):
+            result = extract_content_from_html(html)
+
+        assert "Hello World" in result
+        mock_extract.assert_called_once_with(html, use_readability=False)
+
 
 class TestCheckMayAutonomouslyFetchUrl:
     """Tests for check_may_autonomously_fetch_url function."""
@@ -100,13 +129,14 @@ async def test_allows_when_robots_txt_404(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             # Should not raise
             await check_may_autonomously_fetch_url(
-                "https://example.com/page",
-                DEFAULT_USER_AGENT_AUTONOMOUS
+                "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
             )
 
     @pytest.mark.asyncio
@@ -118,13 +148,14 @@ async def test_blocks_when_robots_txt_401(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             with pytest.raises(McpError):
                 await check_may_autonomously_fetch_url(
-                    "https://example.com/page",
-                    DEFAULT_USER_AGENT_AUTONOMOUS
+                    "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
                 )
 
     @pytest.mark.asyncio
@@ -136,13 +167,14 @@ async def test_blocks_when_robots_txt_403(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             with pytest.raises(McpError):
                 await check_may_autonomously_fetch_url(
-                    "https://example.com/page",
-                    DEFAULT_USER_AGENT_AUTONOMOUS
+                    "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
                 )
 
     @pytest.mark.asyncio
@@ -155,13 +187,14 @@ async def test_allows_when_robots_txt_allows_all(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             # Should not raise
             await check_may_autonomously_fetch_url(
-                "https://example.com/page",
-                DEFAULT_USER_AGENT_AUTONOMOUS
+                "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
             )
 
     @pytest.mark.asyncio
@@ -174,13 +207,14 @@ async def test_blocks_when_robots_txt_disallows_all(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             with pytest.raises(McpError):
                 await check_may_autonomously_fetch_url(
-                    "https://example.com/page",
-                    DEFAULT_USER_AGENT_AUTONOMOUS
+                    "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
                 )
 
 
@@ -207,12 +241,13 @@ async def test_fetch_html_page(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             content, prefix = await fetch_url(
-                "https://example.com/page",
-                DEFAULT_USER_AGENT_AUTONOMOUS
+                "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
             )
 
             # HTML is processed, so we check it returns something
@@ -231,13 +266,15 @@ async def test_fetch_html_page_raw(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             content, prefix = await fetch_url(
                 "https://example.com/page",
                 DEFAULT_USER_AGENT_AUTONOMOUS,
-                force_raw=True
+                force_raw=True,
             )
 
             assert content == html_content
@@ -255,12 +292,13 @@ async def test_fetch_json_returns_raw(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             content, prefix = await fetch_url(
-                "https://api.example.com/data",
-                DEFAULT_USER_AGENT_AUTONOMOUS
+                "https://api.example.com/data", DEFAULT_USER_AGENT_AUTONOMOUS
             )
 
             assert content == json_content
@@ -275,13 +313,14 @@ async def test_fetch_404_raises_error(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             with pytest.raises(McpError):
                 await fetch_url(
-                    "https://example.com/notfound",
-                    DEFAULT_USER_AGENT_AUTONOMOUS
+                    "https://example.com/notfound", DEFAULT_USER_AGENT_AUTONOMOUS
                 )
 
     @pytest.mark.asyncio
@@ -293,13 +332,14 @@ async def test_fetch_500_raises_error(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             with pytest.raises(McpError):
                 await fetch_url(
-                    "https://example.com/error",
-                    DEFAULT_USER_AGENT_AUTONOMOUS
+                    "https://example.com/error", DEFAULT_USER_AGENT_AUTONOMOUS
                 )
 
     @pytest.mark.asyncio
@@ -313,14 +353,54 @@ async def test_fetch_with_proxy(self):
         with patch("httpx.AsyncClient") as mock_client_class:
             mock_client = AsyncMock()
             mock_client.get = AsyncMock(return_value=mock_response)
-            mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
             mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
 
             await fetch_url(
                 "https://example.com/data",
                 DEFAULT_USER_AGENT_AUTONOMOUS,
-                proxy_url="http://proxy.example.com:8080"
+                proxy_url="http://proxy.example.com:8080",
             )
 
             # Verify AsyncClient was called with proxy
-            mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+            mock_client_class.assert_called_once_with(
+                proxy="http://proxy.example.com:8080"
+            )
+
+    @pytest.mark.asyncio
+    async def test_fetch_html_can_disable_readability_js(self):
+        """Test that fetch_url forwards the readability-js preference."""
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.text = (
+            "<html><body><article><p>Hello World</p></article></body></html>"
+        )
+        mock_response.headers = {"content-type": "text/html"}
+
+        with (
+            patch("httpx.AsyncClient") as mock_client_class,
+            patch(
+                "mcp_server_fetch.server.extract_content_from_html",
+                return_value="Hello World",
+            ) as mock_extract,
+        ):
+            mock_client = AsyncMock()
+            mock_client.get = AsyncMock(return_value=mock_response)
+            mock_client_class.return_value.__aenter__ = AsyncMock(
+                return_value=mock_client
+            )
+            mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
+
+            content, prefix = await fetch_url(
+                "https://example.com/page",
+                DEFAULT_USER_AGENT_AUTONOMOUS,
+                use_readability_js=False,
+            )
+
+        assert content == "Hello World"
+        assert prefix == ""
+        mock_extract.assert_called_once_with(
+            mock_response.text, use_readability_js=False
+        )