Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/fetch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume

## Installation

Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
Optionally, install Node.js; this causes the fetch server to use a different, more robust HTML simplifier. If `node` is not available, the server falls back to the Python-only simplifier.

### Using uv (recommended)

Expand Down Expand Up @@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th

The server can be configured to use a proxy by using the `--proxy-url` argument.

### Customization - HTML simplifier

By default, the server uses the Node.js readability simplifier when `node` is available on `PATH`, and otherwise falls back to readabilipy's Python-only simplifier. If a host has Node.js installed but the readability path is slow or misconfigured, add `--no-readability-js` to force the Python-only simplifier.

## Windows Configuration

If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:
Expand Down
14 changes: 13 additions & 1 deletion src/fetch/src/mcp_server_fetch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,21 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
parser.add_argument(
"--no-readability-js",
action="store_true",
help="Use readabilipy's Python-only HTML simplifier instead of the optional Node.js readability path",
)

args = parser.parse_args()
asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
asyncio.run(
serve(
args.user_agent,
args.ignore_robots_txt,
args.proxy_url,
use_readability_js=False if args.no_readability_js else None,
)
)


if __name__ == "__main__":
Expand Down
102 changes: 70 additions & 32 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse

Expand All @@ -24,7 +25,7 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"


def extract_content_from_html(html: str) -> str:
def extract_content_from_html(html: str, use_readability_js: bool | None = None) -> str:
"""Extract and convert HTML content to Markdown format.

Args:
Expand All @@ -33,8 +34,11 @@ def extract_content_from_html(html: str) -> str:
Returns:
Simplified markdown version of the content
"""
if use_readability_js is None:
use_readability_js = shutil.which("node") is not None

ret = readabilipy.simple_json.simple_json_from_html_string(
html, use_readability=True
html, use_readability=use_readability_js
)
if not ret["content"]:
return "<error>Page failed to be simplified from HTML</error>"
Expand Down Expand Up @@ -63,7 +67,9 @@ def get_robots_txt_url(url: str) -> str:
return robots_url


async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: str | None = None) -> None:
async def check_may_autonomously_fetch_url(
url: str, user_agent: str, proxy_url: str | None = None
) -> None:
"""
Check if the URL can be fetched by the user agent according to the robots.txt file.
Raises a McpError if not.
Expand All @@ -80,15 +86,19 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
headers={"User-Agent": user_agent},
)
except HTTPError:
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
))
raise McpError(
ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
)
)
if response.status_code in (401, 403):
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
))
raise McpError(
ErrorData(
code=INTERNAL_ERROR,
message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
)
)
elif 400 <= response.status_code < 500:
return
robot_txt = response.text
Expand All @@ -97,19 +107,25 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
)
robot_parser = Protego.parse(processed_robot_txt)
if not robot_parser.can_fetch(str(url), user_agent):
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
f"<useragent>{user_agent}</useragent>\n"
f"<url>{url}</url>"
f"<robots>\n{robot_txt}\n</robots>\n"
f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
))
raise McpError(
ErrorData(
code=INTERNAL_ERROR,
message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
f"<useragent>{user_agent}</useragent>\n"
f"<url>{url}</url>"
f"<robots>\n{robot_txt}\n</robots>\n"
f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
)
)


async def fetch_url(
url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
url: str,
user_agent: str,
force_raw: bool = False,
proxy_url: str | None = None,
use_readability_js: bool | None = None,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
Expand All @@ -125,12 +141,16 @@ async def fetch_url(
timeout=30,
)
except HTTPError as e:
raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}"))
raise McpError(
ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}")
)
if response.status_code >= 400:
raise McpError(ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch {url} - status code {response.status_code}",
))
raise McpError(
ErrorData(
code=INTERNAL_ERROR,
message=f"Failed to fetch {url} - status code {response.status_code}",
)
)

page_raw = response.text

Expand All @@ -140,7 +160,9 @@ async def fetch_url(
)

if is_page_html and not force_raw:
return extract_content_from_html(page_raw), ""
return extract_content_from_html(
page_raw, use_readability_js=use_readability_js
), ""

return (
page_raw,
Expand Down Expand Up @@ -182,6 +204,7 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
use_readability_js: bool | None = None,
) -> None:
"""Run the fetch MCP server.

Expand Down Expand Up @@ -232,22 +255,32 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required"))

if not ignore_robots_txt:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
await check_may_autonomously_fetch_url(
url, user_agent_autonomous, proxy_url
)

content, prefix = await fetch_url(
url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
url,
user_agent_autonomous,
force_raw=args.raw,
proxy_url=proxy_url,
use_readability_js=use_readability_js,
)
original_length = len(content)
if args.start_index >= original_length:
content = "<error>No more content available.</error>"
else:
truncated_content = content[args.start_index : args.start_index + args.max_length]
truncated_content = content[
args.start_index : args.start_index + args.max_length
]
if not truncated_content:
content = "<error>No more content available.</error>"
else:
content = truncated_content
actual_content_length = len(truncated_content)
remaining_content = original_length - (args.start_index + actual_content_length)
remaining_content = original_length - (
args.start_index + actual_content_length
)
# Only add the prompt to continue fetching if there is still remaining content
if actual_content_length == args.max_length and remaining_content > 0:
next_start = args.start_index + actual_content_length
Expand All @@ -262,7 +295,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]

try:
content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
content, prefix = await fetch_url(
url,
user_agent_manual,
proxy_url=proxy_url,
use_readability_js=use_readability_js,
)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
Expand Down
Loading
Loading