diff --git a/src/coding/proxy/routing/executor.py b/src/coding/proxy/routing/executor.py index 537b2b0..7ed3c1c 100644 --- a/src/coding/proxy/routing/executor.py +++ b/src/coding/proxy/routing/executor.py @@ -7,6 +7,7 @@ from __future__ import annotations import logging +import re import time from collections.abc import AsyncIterator from typing import Any @@ -54,16 +55,71 @@ _SESSION_TITLE_MAX_LEN = 30 +# Claude Code 注入的"噪声"标签 — 系统级上下文,不应进入 Session 标题。 +# 这些标签由 CC harness 在首个 user 消息 content 中拼接,高度同质, +# 直接用作标题会导致跨会话标题无差异化,丧失辨识度。 +_NOISE_TAG_PATTERN = re.compile( + r"<(?Psystem-reminder|user-preferences|" + r"local-command-stdout|local-command-stderr|" + r"bash-input|bash-stdout|bash-stderr|" + r"ide_selection|stdin|system_instruction)\b[^>]*>" + r".*?", + flags=re.DOTALL | re.IGNORECASE, +) + +# Slash command 子标签:用于识别 /commit、/review 等命令式调用, +# 合成"命令 + 参数"式标题。 +_CMD_NAME_PATTERN = re.compile(r"(.*?)", flags=re.DOTALL) +_CMD_ARGS_PATTERN = re.compile(r"(.*?)", flags=re.DOTALL) +# 残留 command-* 包裹标签清除(command-message/command-stdout 等次要标签)。 +_CMD_WRAPPER_PATTERN = re.compile( + r".*?", flags=re.DOTALL +) + + +def _sanitize_user_text(raw: str) -> str: + """剔除 Claude Code 注入的系统级 XML 块,还原真实用户输入。 + + 处理顺序: + 1. Slash command 优先识别 — 若检测到 ,合成"命令 + 参数" + 式标题(因为残留文本通常为空,直接取标签内容更有意义)。 + 2. 通用噪声剥离 — 移除已知白名单内的 system-reminder 等标签。 + 3. 残留 command-* 包裹清除 — 兜底去除 command-message 等次要标签。 + 4. 前后空白归一化 — 折叠连续空白为单空格,便于 30 字截断。 + """ + if not raw: + return "" + + # 阶段一: slash command 短路 + cmd = _CMD_NAME_PATTERN.search(raw) + if cmd: + name = cmd.group(1).strip() + args_match = _CMD_ARGS_PATTERN.search(raw) + args = args_match.group(1).strip() if args_match else "" + composed = f"{name} {args}".strip() if args else name + if composed: + return composed + + # 阶段二: 通用噪声剥离 + cleaned = _NOISE_TAG_PATTERN.sub("", raw) + cleaned = _CMD_WRAPPER_PATTERN.sub("", cleaned) + + # 阶段三: 空白折叠 + return re.sub(r"\s+", " ", cleaned).strip() + def _extract_session_title(request: CanonicalRequest) -> str: - """从规范化请求中提取首个用户消息文本作为 session 标题.""" + """从规范化请求中提取首个用户消息文本作为 session 标题。 + + 跳过 Claude Code 注入的系统级 XML 块(system-reminder、user-preferences 等), + 确保标题反映用户真实输入而非高同质化的系统模板。 + """ for part in request.messages: - if ( - part.role == "user" - and part.type == CanonicalPartType.TEXT - and part.text.strip() - ): - return part.text.strip()[:_SESSION_TITLE_MAX_LEN] + if part.role != "user" or part.type != CanonicalPartType.TEXT: + continue + cleaned = _sanitize_user_text(part.text) + if cleaned: + return cleaned[:_SESSION_TITLE_MAX_LEN] return "" diff --git a/tests/test_router_executor.py b/tests/test_router_executor.py index dc37939..6dd630e 100644 --- a/tests/test_router_executor.py +++ b/tests/test_router_executor.py @@ -20,11 +20,14 @@ build_canonical_request, ) from coding.proxy.routing.executor import ( + _SESSION_TITLE_MAX_LEN, _VENDOR_PROTOCOL_LABEL_MAP, + _extract_session_title, _has_tool_results, _is_likely_request_format_error, _log_vendor_response_error, _RouteExecutor, + _sanitize_user_text, ) from coding.proxy.routing.session_manager import RouteSessionManager from coding.proxy.routing.tier import VendorTier @@ -1949,3 +1952,162 @@ def test_returns_body_for_unknown_tier(self): result = exec_inst._prepare_body_for_tier(body, tier, source_vendor="zhipu") assert result is body + + +# ── Session 标题清洗与抽取测试 ───────────────────────────────── + + +class TestSanitizeUserText: + """``_sanitize_user_text`` — 剥离 CC 注入的系统级 XML 块. + + 覆盖典型 system-reminder/user-preferences 噪声、slash command + 短路、空白折叠与边界场景。 + """ + + def test_strips_system_reminder(self): + raw = "MCP 指令这是用户真实输入" + assert _sanitize_user_text(raw) == "这是用户真实输入" + + def test_strips_user_preferences(self): + raw = "用户问题遵循 AGENTS.md" + assert _sanitize_user_text(raw) == "用户问题" + + def test_strips_multiple_noise_blocks(self): + raw = ( + "A" + "B" + "C" + "D" + "真实输入文本" + "P" + ) + assert _sanitize_user_text(raw) == "真实输入文本" + + def test_strips_multiline_system_reminder(self): + """多行 system-reminder 块需被 DOTALL 完整匹配剥离.""" + raw = ( + "\n" + "# MCP Server Instructions\n" + "Use this server to fetch ...\n" + "\n" + "TITLE 中的 Session 标题应当取自用户输入" + ) + assert _sanitize_user_text(raw) == "TITLE 中的 Session 标题应当取自用户输入" + + def test_strips_tag_with_attributes(self): + """容忍标签携带属性(如 ).""" + raw = 'noise真实' + assert _sanitize_user_text(raw) == "真实" + + def test_slash_command_with_args(self): + raw = ( + "commit (user)" + "/commit" + "修复标题" + ) + assert _sanitize_user_text(raw) == "/commit 修复标题" + + def test_slash_command_no_args(self): + raw = "/review" + assert _sanitize_user_text(raw) == "/review" + + def test_collapses_whitespace(self): + raw = "X\n\n 多余 空白\t\t折叠 " + assert _sanitize_user_text(raw) == "多余 空白 折叠" + + def test_empty_after_strip(self): + raw = "仅噪声" + assert _sanitize_user_text(raw) == "" + + def test_empty_input(self): + assert _sanitize_user_text("") == "" + + def test_preserves_user_xml_like_content(self): + """用户输入中合法的 XML/HTML 片段(非白名单标签)需完整保留.""" + raw = "请帮我审查这段代码:
hello
是否符合规范?" + assert _sanitize_user_text(raw) == raw + + def test_strips_local_command_output(self): + raw = "build ok构建后的下一步问题" + assert _sanitize_user_text(raw) == "构建后的下一步问题" + + +class TestExtractSessionTitle: + """``_extract_session_title`` — 端到端从 CanonicalRequest 抽取标题.""" + + @staticmethod + def _build_request(messages: list[dict]): + return build_canonical_request({"model": "test", "messages": messages}, {}) + + def test_truncates_to_max_len(self): + long_text = "用户输入文本" * 20 + req = self._build_request([{"role": "user", "content": long_text}]) + title = _extract_session_title(req) + assert len(title) == _SESSION_TITLE_MAX_LEN + assert title == long_text[:_SESSION_TITLE_MAX_LEN] + + def test_strips_noise_from_first_user_message(self): + raw = ( + "MCP 指令" + "偏好" + "测试标题 ABC" + ) + req = self._build_request([{"role": "user", "content": raw}]) + assert _extract_session_title(req) == "测试标题 ABC" + + def test_handles_real_cc_first_message_shape(self): + """模拟 CC 真实首条消息(多个连续 system-reminder + 用户文本).""" + raw = ( + "\n# MCP Server Instructions\n..." + "\nThe following skills...\n" + "\nPlan mode is active...\n" + "\n\nTITLE 中的 Session 标题应当取自用户输入的信息前 30 个字\n\n" + "始终遵循 AGENTS.md" + ) + req = self._build_request([{"role": "user", "content": raw}]) + title = _extract_session_title(req) + assert title.startswith("TITLE 中的 Session") + assert len(title) <= _SESSION_TITLE_MAX_LEN + + def test_extracts_slash_command(self): + raw = ( + "/commit" + "feat: 新增标题清洗" + ) + req = self._build_request([{"role": "user", "content": raw}]) + assert _extract_session_title(req) == "/commit feat: 新增标题清洗" + + def test_returns_empty_when_only_noise(self): + raw = "纯噪声" + req = self._build_request([{"role": "user", "content": raw}]) + assert _extract_session_title(req) == "" + + def test_returns_empty_for_no_user_messages(self): + req = self._build_request([{"role": "assistant", "content": "你好"}]) + assert _extract_session_title(req) == "" + + def test_skips_noise_only_part_to_find_real_input(self): + """首个 user text part 全噪声时,fallback 到下一个非空 user part.""" + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "noise", + }, + {"type": "text", "text": "真实问题"}, + ], + } + ] + req = self._build_request(messages) + assert _extract_session_title(req) == "真实问题" + + def test_skips_assistant_role(self): + """assistant 角色的文本不应被作为标题候选.""" + messages = [ + {"role": "assistant", "content": "上一轮回答"}, + {"role": "user", "content": "新的用户问题"}, + ] + req = self._build_request(messages) + assert _extract_session_title(req) == "新的用户问题"